From f25f21cb9fa94aef6c70036456741a715328f33f Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Jun 2026 13:29:19 +0200 Subject: [PATCH 01/14] refactor(context): extract nvisy-context crate + wire engine enhancer pass Lifts crate::context out of nvisy-core into a sibling nvisy-context crate so the SDK base stays primitives-only for third-party recognizer authors. Adds NerRecognizer::context_registry (mirroring PatternRegistry::context_registry) and wires ContextEnhancer into DetectionPhase: build_for_request now returns DetectionResources { recognizers, enhancer }, the enhancer runs in block-local coordinates between recognizer dispatch and modality lifting, and the substring path runs by default (Tokens artifact wiring follows when an NlpEngine is plumbed in). Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 15 +++++ Cargo.toml | 2 + crates/nvisy-context/Cargo.toml | 41 +++++++++++++ crates/nvisy-context/README.md | 44 ++++++++++++++ .../src}/declaration.rs | 0 .../context => nvisy-context/src}/enhancer.rs | 44 ++++++++------ crates/nvisy-context/src/lib.rs | 15 +++++ .../context => nvisy-context/src}/matcher.rs | 0 .../context => nvisy-context/src}/registry.rs | 11 ++++ .../context => nvisy-context/src}/tokens.rs | 4 +- crates/nvisy-core/src/context/mod.rs | 41 ------------- crates/nvisy-core/src/lib.rs | 1 - crates/nvisy-engine/Cargo.toml | 1 + crates/nvisy-engine/src/core/context.rs | 17 ++++++ .../nvisy-engine/src/detection/config/mod.rs | 59 ++++++++++++++----- crates/nvisy-engine/src/detection/document.rs | 5 +- crates/nvisy-engine/src/detection/mod.rs | 4 +- .../src/detection/phases/detection.rs | 28 ++++++--- crates/nvisy-engine/src/detection/pipeline.rs | 5 +- crates/nvisy-ner/Cargo.toml | 1 + crates/nvisy-ner/src/nlp/engine.rs | 4 +- crates/nvisy-ner/src/nlp/mod.rs | 4 +- crates/nvisy-ner/src/recognition/config.rs | 2 +- .../nvisy-ner/src/recognition/recognizer.rs | 21 ++++++- crates/nvisy-pattern/Cargo.toml | 1 + 
.../src/recognition/dictionary.rs | 2 +- crates/nvisy-pattern/src/recognition/mod.rs | 4 +- .../src/recognition/recognizer.rs | 2 +- .../src/recognition/regex_rule.rs | 2 +- .../nvisy-pattern/src/recognition/registry.rs | 6 +- .../nvisy-pattern/tests/enhancer_roundtrip.rs | 5 +- 31 files changed, 287 insertions(+), 104 deletions(-) create mode 100644 crates/nvisy-context/Cargo.toml create mode 100644 crates/nvisy-context/README.md rename crates/{nvisy-core/src/context => nvisy-context/src}/declaration.rs (100%) rename crates/{nvisy-core/src/context => nvisy-context/src}/enhancer.rs (92%) create mode 100644 crates/nvisy-context/src/lib.rs rename crates/{nvisy-core/src/context => nvisy-context/src}/matcher.rs (100%) rename crates/{nvisy-core/src/context => nvisy-context/src}/registry.rs (89%) rename crates/{nvisy-core/src/context => nvisy-context/src}/tokens.rs (98%) delete mode 100644 crates/nvisy-core/src/context/mod.rs diff --git a/Cargo.lock b/Cargo.lock index d5056aea..a9b28db5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2891,6 +2891,18 @@ dependencies = [ "uuid", ] +[[package]] +name = "nvisy-context" +version = "0.1.0" +dependencies = [ + "derive_builder", + "hipstr", + "nvisy-core", + "schemars", + "serde", + "thiserror", +] + [[package]] name = "nvisy-core" version = "0.1.0" @@ -2928,6 +2940,7 @@ dependencies = [ "humantime-serde", "jiff", "nvisy-codec", + "nvisy-context", "nvisy-core", "nvisy-engine", "nvisy-llm", @@ -2999,6 +3012,7 @@ dependencies = [ "bentoml", "derive_builder", "lingua", + "nvisy-context", "nvisy-core", "serde", "tokio", @@ -3030,6 +3044,7 @@ dependencies = [ "async-trait", "csv", "derive_builder", + "nvisy-context", "nvisy-core", "regex", "schemars", diff --git a/Cargo.toml b/Cargo.toml index 9a236f0f..0c74b1d1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,6 +5,7 @@ resolver = "3" members = [ "./crates/nvisy-cli", "./crates/nvisy-codec", + "./crates/nvisy-context", "./crates/nvisy-core", "./crates/nvisy-engine", "./crates/nvisy-fake", 
@@ -37,6 +38,7 @@ documentation = "https://docs.rs/nvisy-runtime" # Internal crates nvisy-codec = { path = "./crates/nvisy-codec", version = "0.1.0", default-features = false } +nvisy-context = { path = "./crates/nvisy-context", version = "0.1.0" } nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } nvisy-fake = { path = "./crates/nvisy-fake", version = "0.1.0" } diff --git a/crates/nvisy-context/Cargo.toml b/crates/nvisy-context/Cargo.toml new file mode 100644 index 00000000..8c53f2d1 --- /dev/null +++ b/crates/nvisy-context/Cargo.toml @@ -0,0 +1,41 @@ +# https://doc.rust-lang.org/cargo/reference/manifest.html + +[package] +name = "nvisy-context" +description = "Post-recognition keyword-boost enhancer for Nvisy entities" +keywords = ["nvisy", "context", "enhancer", "pii"] +categories = ["text-processing"] +readme = "README.md" + +version = { workspace = true } +rust-version = { workspace = true } +edition = { workspace = true } +license = { workspace = true } +publish = { workspace = true } + +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } + +[package.metadata.docs.rs] +all-features = true +rustdoc-args = ["--cfg", "docsrs"] + +[dependencies] +# Internal crates +nvisy-core = { workspace = true, features = [] } + +# Serialization +serde = { workspace = true, features = [] } +schemars = { workspace = true, features = [] } + +# Derive macros and error handling +derive_builder = { workspace = true, features = [] } +thiserror = { workspace = true, features = [] } + +# Primitive datatypes (cheap-clone surface form on `Token`) +hipstr = { workspace = true, features = [] } + +[dev-dependencies] +nvisy-core = { workspace = true, features = ["test-utils"] } diff --git a/crates/nvisy-context/README.md b/crates/nvisy-context/README.md new file mode 100644 index 00000000..e8653b7a --- /dev/null +++ 
b/crates/nvisy-context/README.md @@ -0,0 +1,44 @@ +# nvisy-context + +[![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) + +Post-recognition keyword-boost enhancer for the Nvisy runtime. + +## Overview + +Mirrors Presidio's `ContextAwareEnhancer` pattern. Every recognizer +that wants score boosting declares a `Context` (a list of keywords +plus optional window / boost overrides), registered against the +recognizer's name. After recognition, `ContextEnhancer` walks each +detected `Entity`, looks the recognizer name up in the +`ContextRegistry`, scans the surrounding window for any declared +keyword via the configured `KeywordMatcher`, and bumps the entity's +confidence on a hit. + +`Tokens` is the optional NLP artifact (surface + lemma per token) +that a tokenizing NLP engine stashes on `RecognizerInput.artifacts` +so `LemmaMatcher` can match morphological variants (`running` → +`run`). The `SubstringMatcher` fallback runs whenever no `Tokens` +artifact is present. + +The crate depends only on `nvisy-core` for `Entity`, +`TrailStep`, and `Confidence` — recognizer crates and the engine +each depend on `nvisy-context` to participate. + +## Documentation + +See [`docs/`](../../docs/) for architecture, security, and API documentation. + +## Changelog + +See [CHANGELOG.md](../../CHANGELOG.md) for release notes and version history. 
+ +## License + +Apache 2.0 License, see [LICENSE.txt](../../LICENSE.txt) + +## Support + +- **Documentation**: [docs.nvisy.com](https://docs.nvisy.com) +- **Issues**: [GitHub Issues](https://github.com/nvisycom/runtime/issues) +- **Email**: [support@nvisy.com](mailto:support@nvisy.com) diff --git a/crates/nvisy-core/src/context/declaration.rs b/crates/nvisy-context/src/declaration.rs similarity index 100% rename from crates/nvisy-core/src/context/declaration.rs rename to crates/nvisy-context/src/declaration.rs diff --git a/crates/nvisy-core/src/context/enhancer.rs b/crates/nvisy-context/src/enhancer.rs similarity index 92% rename from crates/nvisy-core/src/context/enhancer.rs rename to crates/nvisy-context/src/enhancer.rs index 38ff3794..ab406d2d 100644 --- a/crates/nvisy-core/src/context/enhancer.rs +++ b/crates/nvisy-context/src/enhancer.rs @@ -2,14 +2,14 @@ //! any [`Entity`] regardless of which recognizer produced it. use derive_builder::{Builder, UninitializedFieldError}; -use type_map::concurrent::TypeMap; +use nvisy_core::entity::{Entity, TrailStep}; +use nvisy_core::extraction::Artifacts; +use nvisy_core::modality::Text; +use nvisy_core::primitive::Confidence; use super::Tokens; use super::matcher::{KeywordMatcher, SubstringMatcher}; use super::registry::ContextRegistry; -use crate::entity::{Entity, TrailStep}; -use crate::modality::Text; -use crate::primitive::Confidence; /// Post-recognition enhancer that boosts entity confidence when /// keywords declared by the source recognizer appear near the match. @@ -81,6 +81,14 @@ impl ContextEnhancer { ContextEnhancerBuilder::default() } + /// Borrow the underlying registry. Useful for diagnostics and + /// for engine code that wants to short-circuit when there are + /// no entries to boost against. + #[must_use] + pub fn registry(&self) -> &ContextRegistry { + &self.registry + } + /// Apply context-keyword boosting to `entities` in place. 
/// /// For each entity, looks at its first recognition step's @@ -96,14 +104,14 @@ impl ContextEnhancer { /// declared context has an empty keyword list) pass through /// unchanged. /// - /// [`Refinement`]: crate::entity::TrailStepKind::Refinement - pub fn enhance(&self, entities: &mut [Entity], text: &str, artifacts: &TypeMap) { + /// [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement + pub fn enhance(&self, entities: &mut [Entity], text: &str, artifacts: &Artifacts) { for entity in entities.iter_mut() { self.enhance_one(entity, text, artifacts); } } - fn enhance_one(&self, entity: &mut Entity, text: &str, artifacts: &TypeMap) { + fn enhance_one(&self, entity: &mut Entity, text: &str, artifacts: &Artifacts) { let Some(name) = entity .trail .first() @@ -216,15 +224,15 @@ impl From for ContextEnhancerBuilderError { #[cfg(test)] mod tests { - use type_map::concurrent::TypeMap; - - use super::*; - use crate::context::Context; - use crate::entity::{ + use nvisy_core::entity::{ EntityLabelRef, ModelProvenance, PatternProvenance, TrailProvenance, TrailStepKind, builtins, }; - use crate::modality::{Text, TextLocation}; + use nvisy_core::extraction::Artifacts; + use nvisy_core::modality::{Text, TextLocation}; + + use super::*; + use crate::Context; fn pattern_entity(name: &str, span: std::ops::Range) -> Entity { let confidence = Confidence::new(0.6).unwrap(); @@ -284,7 +292,7 @@ mod tests { let text = "Your SSN: 123-45-6789"; let mut entities = vec![pattern_entity("ssn", 10..21)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &TypeMap::new()); + enhancer.enhance(&mut entities, text, &Artifacts::new()); assert!(entities[0].confidence.get() > before); assert!( entities[0] @@ -308,7 +316,7 @@ mod tests { let text = "Mr. 
Smith is named in the report."; let mut entities = vec![model_entity("gliner", 4..9)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &TypeMap::new()); + enhancer.enhance(&mut entities, text, &Artifacts::new()); assert!(entities[0].confidence.get() > before); let TrailProvenance::Model(prov) = &entities[0].trail[0].provenance else { panic!("expected model provenance"); @@ -323,7 +331,7 @@ mod tests { let text = "Your SSN: 123-45-6789"; let mut entities = vec![pattern_entity("ssn", 10..21)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &TypeMap::new()); + enhancer.enhance(&mut entities, text, &Artifacts::new()); assert_eq!(entities[0].confidence.get(), before); } @@ -335,7 +343,7 @@ mod tests { let text = "far_keyword XYZ here"; let mut entities = vec![pattern_entity("far", 39..42)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &TypeMap::new()); + enhancer.enhance(&mut entities, text, &Artifacts::new()); assert_eq!(entities[0].confidence.get(), before); } @@ -349,7 +357,7 @@ mod tests { // Push base confidence to 0.95 entity.confidence = Confidence::new(0.95).unwrap(); let mut entities = vec![entity]; - enhancer.enhance(&mut entities, text, &TypeMap::new()); + enhancer.enhance(&mut entities, text, &Artifacts::new()); assert!((entities[0].confidence.get() - 1.0).abs() < f64::EPSILON); } } diff --git a/crates/nvisy-context/src/lib.rs b/crates/nvisy-context/src/lib.rs new file mode 100644 index 00000000..2004d7c6 --- /dev/null +++ b/crates/nvisy-context/src/lib.rs @@ -0,0 +1,15 @@ +#![forbid(unsafe_code)] +#![cfg_attr(docsrs, feature(doc_cfg))] +#![doc = include_str!("../README.md")] + +mod declaration; +mod enhancer; +mod matcher; +mod registry; +mod tokens; + +pub use self::declaration::Context; +pub use self::enhancer::{ContextEnhancer, ContextEnhancerBuilder, ContextEnhancerBuilderError}; +pub use self::matcher::{KeywordMatcher, LemmaMatcher, 
SubstringMatcher}; +pub use self::registry::ContextRegistry; +pub use self::tokens::{Token, Tokens}; diff --git a/crates/nvisy-core/src/context/matcher.rs b/crates/nvisy-context/src/matcher.rs similarity index 100% rename from crates/nvisy-core/src/context/matcher.rs rename to crates/nvisy-context/src/matcher.rs diff --git a/crates/nvisy-core/src/context/registry.rs b/crates/nvisy-context/src/registry.rs similarity index 89% rename from crates/nvisy-core/src/context/registry.rs rename to crates/nvisy-context/src/registry.rs index ef301338..d0043bc9 100644 --- a/crates/nvisy-core/src/context/registry.rs +++ b/crates/nvisy-context/src/registry.rs @@ -68,6 +68,17 @@ impl ContextRegistry { self } + /// Merge another registry into this one. Last-write-wins on + /// duplicate names. Used to combine per-source registries (e.g. + /// pattern registry + NER registry) into one enhancer input. + #[must_use] + pub fn merge(mut self, other: ContextRegistry) -> Self { + for (name, context) in other.entries { + self.entries.insert(name, context); + } + self + } + /// Look up the [`Context`] for `name`. Returns `None` when the /// name was never registered or when the registered context /// had an empty keyword list (which is treated as "not diff --git a/crates/nvisy-core/src/context/tokens.rs b/crates/nvisy-context/src/tokens.rs similarity index 98% rename from crates/nvisy-core/src/context/tokens.rs rename to crates/nvisy-context/src/tokens.rs index 55cb21e0..24181797 100644 --- a/crates/nvisy-core/src/context/tokens.rs +++ b/crates/nvisy-context/src/tokens.rs @@ -15,7 +15,7 @@ //! //! [`around`]: Tokens::around //! [`lemmas_in`]: Tokens::lemmas_in -//! [`Entity::location`]: crate::entity::Entity::location +//! [`Entity::location`]: nvisy_core::entity::Entity::location //! //! Tokens live next to the [`ContextEnhancer`] because that's the //! 
only consumer: the enhancer reads them off @@ -102,7 +102,7 @@ impl Token { /// The owning token sequence carried by a /// [`RecognizerInput::artifacts`] bundle. /// -/// [`RecognizerInput::artifacts`]: crate::recognition::RecognizerInput::artifacts +/// [`RecognizerInput::artifacts`]: nvisy_core::recognition::RecognizerInput::artifacts /// /// Tokens are sorted by `offset.start` (producers should emit them /// in order; consumer-side code assumes this). The collection diff --git a/crates/nvisy-core/src/context/mod.rs b/crates/nvisy-core/src/context/mod.rs deleted file mode 100644 index bab43af9..00000000 --- a/crates/nvisy-core/src/context/mod.rs +++ /dev/null @@ -1,41 +0,0 @@ -//! Post-recognition keyword-boost enhancement, shared across every -//! [`EntityRecognizer`]. -//! -//! The enhancer takes a slice of detected entities plus the source -//! text and the shared `RecognizerInput::artifacts` `TypeMap`, and for each -//! entity: -//! -//! 1. Pulls the source recognizer's name from the entity's first -//! `TrailStep` provenance. -//! 2. Looks the name up in a [`ContextRegistry`] to find the -//! declared keyword [`Context`]. -//! 3. Walks the surrounding window (token-based when `Tokens` is -//! present in the artifact map, substring-based otherwise) and -//! asks the configured [`KeywordMatcher`] whether any keyword -//! fired. -//! 4. Applies the configured boost (or the per-entity override), -//! capped at `1.0`, and appends a `Refinement` step to the -//! trail. -//! -//! The registry shape — `name → Context` — is the same pattern -//! Presidio uses: each recognizer (or each rule within a -//! recognizer) registers a *source name* and a keyword list, and -//! the enhancer dispatches on the name carried in the entity's -//! provenance. Per-rule contexts for patterns (`Regex.context`, -//! `Dictionary.context` in `nvisy-pattern`) and per-recognizer -//! contexts for NER (`NerRecognizer.default_context` in -//! `nvisy-ner`) plug into the same registry. -//! 
-//! [`EntityRecognizer`]: crate::recognition::EntityRecognizer - -mod declaration; -mod enhancer; -mod matcher; -mod registry; -mod tokens; - -pub use self::declaration::Context; -pub use self::enhancer::{ContextEnhancer, ContextEnhancerBuilder, ContextEnhancerBuilderError}; -pub use self::matcher::{KeywordMatcher, LemmaMatcher, SubstringMatcher}; -pub use self::registry::ContextRegistry; -pub use self::tokens::{Token, Tokens}; diff --git a/crates/nvisy-core/src/lib.rs b/crates/nvisy-core/src/lib.rs index f32ebaf5..ea75f4cc 100644 --- a/crates/nvisy-core/src/lib.rs +++ b/crates/nvisy-core/src/lib.rs @@ -2,7 +2,6 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -pub mod context; pub mod entity; pub mod extraction; pub mod health; diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index ff30c194..5f516ed7 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -39,6 +39,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates +nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } nvisy-toolkit = { workspace = true, features = [] } nvisy-codec = { workspace = true, features = ["text"] } diff --git a/crates/nvisy-engine/src/core/context.rs b/crates/nvisy-engine/src/core/context.rs index 3dfdbccd..865e8102 100644 --- a/crates/nvisy-engine/src/core/context.rs +++ b/crates/nvisy-engine/src/core/context.rs @@ -17,6 +17,7 @@ use std::num::NonZeroUsize; use std::sync::Arc; +use nvisy_context::ContextEnhancer; use nvisy_toolkit::detection::RecognizerRegistry; use nvisy_toolkit::extraction::ExtractorRegistry; use tokio_util::sync::CancellationToken; @@ -51,6 +52,11 @@ pub struct DetectionContext { /// engine-side detection-config template plus the request's /// label catalog. 
pub(crate) recognizer_registry: Arc, + /// Post-recognition keyword-boost enhancer — built alongside + /// `recognizer_registry` from the same recognizer set. Shared + /// behind `Arc` so per-document phases borrow it without + /// cloning the embedded registry / matcher. + pub(crate) context_enhancer: Arc, pub(crate) concurrency: Option, } @@ -61,6 +67,7 @@ pub struct DetectionContext { pub(crate) struct DetectionEngines { pub extraction_engine: ExtractorRegistry, pub recognizer_registry: Arc, + pub context_enhancer: Arc, } impl DetectionContext { @@ -75,12 +82,14 @@ impl DetectionContext { let DetectionEngines { extraction_engine, recognizer_registry, + context_enhancer, } = engines; Self { cancel, shared, extraction_engine, recognizer_registry, + context_enhancer, concurrency, } } @@ -99,6 +108,14 @@ impl DetectionContext { pub(crate) fn recognizer_registry(&self) -> &Arc { &self.recognizer_registry } + + /// Per-request context-keyword enhancer borrowed by + /// [`DetectionPhase`]. + /// + /// [`DetectionPhase`]: crate::detection::phases::detection::DetectionPhase + pub(crate) fn context_enhancer(&self) -> &Arc { + &self.context_enhancer + } } impl PhaseContext for DetectionContext { diff --git a/crates/nvisy-engine/src/detection/config/mod.rs b/crates/nvisy-engine/src/detection/config/mod.rs index a5288936..f9c77164 100644 --- a/crates/nvisy-engine/src/detection/config/mod.rs +++ b/crates/nvisy-engine/src/detection/config/mod.rs @@ -17,6 +17,7 @@ mod pattern; #[cfg(not(feature = "bento"))] use nvisy_core::Error; +use nvisy_context::{ContextEnhancer, ContextRegistry}; use nvisy_core::Result; use nvisy_core::entity::EntityLabelCatalog; use nvisy_core::modality::Text; @@ -34,6 +35,26 @@ pub use self::pattern::PatternDetection; /// provenance on emitted entities). const NER_RECOGNIZER_NAME: &str = "ner"; +/// Engine-wide defaults for the post-recognition [`ContextEnhancer`]. 
+/// Mirrors Presidio's defaults (`context_similarity_factor = 0.35`, +/// `context_prefix_count` of ~5 words, approximated as 50 bytes). +const ENHANCER_DEFAULT_WINDOW: usize = 50; +const ENHANCER_DEFAULT_BOOST: f64 = 0.35; + +/// Bundle returned by [`DetectionConfig::build_for_request`]: +/// the per-request recognizer registry plus the matching +/// [`ContextEnhancer`] built from each recognizer's declared +/// context keywords. +pub struct DetectionResources { + /// Recognizers selected for this request. + pub recognizers: RecognizerRegistry, + /// Post-recognition keyword-boost enhancer for `Text` + /// entities. Always present; carries no usable contexts when + /// no recognizer declared context keywords (cheap to skip + /// inside [`ContextEnhancer::enhance`]). + pub enhancer: ContextEnhancer, +} + /// Configuration for the [`RecognizerRegistry`]. /// /// Each field maps to a `[detection.*]` section in `Nvisy.toml`. @@ -71,13 +92,15 @@ impl DetectionConfig { /// Returns the first construction error encountered — pattern /// compile failure, NER backend init failure, or a /// config-selected backend whose feature wasn't compiled in. 
- pub fn build_for_request(&self, catalog: &EntityLabelCatalog) -> Result { + pub fn build_for_request(&self, catalog: &EntityLabelCatalog) -> Result { let mut reg = RecognizerRegistry::new(); + let mut context_registry = ContextRegistry::new(); let pattern_cfg = self.pattern.clone().unwrap_or_default(); if pattern_cfg.enabled { let pattern_registry = PatternRegistry::builtin().filter_by_catalog(catalog); if !pattern_registry.is_empty() { + context_registry = context_registry.merge(pattern_registry.context_registry()); let recognizer = PatternRecognizer::builder() .with_registry(pattern_registry) .build()?; @@ -87,25 +110,21 @@ impl DetectionConfig { if let Some(ner_cfg) = self.ner.as_ref().filter(|c| c.enabled) { let supported_labels = catalog.iter().map(|l| l.label_ref()).collect::>(); - reg = match &ner_cfg.backend { - NerBackend::Noop => { - let recognizer = NerRecognizer::builder() - .with_name(NER_RECOGNIZER_NAME) - .with_engine(NoopBackend) - .with_supported_labels(supported_labels) - .build()?; - reg.with_recognizer::(recognizer) - } + let recognizer = match &ner_cfg.backend { + NerBackend::Noop => NerRecognizer::builder() + .with_name(NER_RECOGNIZER_NAME) + .with_engine(NoopBackend) + .with_supported_labels(supported_labels) + .build()?, #[cfg(feature = "bento")] NerBackend::Bento { base_url } => { let backend = BentoBackend::new(BentoParams::new(base_url.clone()))?; - let recognizer = NerRecognizer::builder() + NerRecognizer::builder() .with_name(NER_RECOGNIZER_NAME) .with_engine(backend) .with_supported_labels(supported_labels) - .build()?; - reg.with_recognizer::(recognizer) + .build()? 
} #[cfg(not(feature = "bento"))] @@ -116,8 +135,20 @@ impl DetectionConfig { )); } }; + context_registry = context_registry.merge(recognizer.context_registry()); + reg = reg.with_recognizer::(recognizer); } - Ok(reg) + let enhancer = ContextEnhancer::builder() + .with_registry(context_registry) + .with_default_window(ENHANCER_DEFAULT_WINDOW) + .with_default_boost(ENHANCER_DEFAULT_BOOST) + .build() + .expect("enhancer fields (window, boost, registry) all set"); + + Ok(DetectionResources { + recognizers: reg, + enhancer, + }) } } diff --git a/crates/nvisy-engine/src/detection/document.rs b/crates/nvisy-engine/src/detection/document.rs index 379420e0..378906b8 100644 --- a/crates/nvisy-engine/src/detection/document.rs +++ b/crates/nvisy-engine/src/detection/document.rs @@ -27,7 +27,10 @@ impl DetectionDocumentPipeline { pub(super) fn from_context(ctx: &DetectionContext) -> Self { Self { extraction: ExtractionPhase::new(ctx.extraction_engine().clone()), - detection: DetectionPhase::new(ctx.recognizer_registry().clone()), + detection: DetectionPhase::new( + ctx.recognizer_registry().clone(), + ctx.context_enhancer().clone(), + ), deduplication: DeduplicationPhase::new(), } } diff --git a/crates/nvisy-engine/src/detection/mod.rs b/crates/nvisy-engine/src/detection/mod.rs index 31e50884..b0c2e6d0 100644 --- a/crates/nvisy-engine/src/detection/mod.rs +++ b/crates/nvisy-engine/src/detection/mod.rs @@ -28,7 +28,9 @@ mod result; mod state; mod status; -pub use self::config::{DetectionConfig, NerBackend, NerDetection, PatternDetection}; +pub use self::config::{ + DetectionConfig, DetectionResources, NerBackend, NerDetection, PatternDetection, +}; pub use self::engine::DetectionEngine; pub use self::extraction::ExtractionConfig; #[cfg(feature = "image")] diff --git a/crates/nvisy-engine/src/detection/phases/detection.rs b/crates/nvisy-engine/src/detection/phases/detection.rs index b86bdb8c..ef60bf51 100644 --- a/crates/nvisy-engine/src/detection/phases/detection.rs +++ 
b/crates/nvisy-engine/src/detection/phases/detection.rs @@ -11,8 +11,10 @@ use std::sync::Arc; +use nvisy_context::ContextEnhancer; use nvisy_core::Result; use nvisy_core::entity::Entity; +use nvisy_core::extraction::Artifacts; use nvisy_core::modality::{ Audio, AudioLocation, Image, ImageLocation, Overlap, Tabular, TabularLocation, Text, TextData, TextLocation, @@ -34,18 +36,21 @@ const TARGET: &str = "nvisy_engine::detection"; /// /// Holds an `Arc` so the registry is shared /// cheaply across per-document phases without cloning the -/// underlying recognizer lists. +/// underlying recognizer lists, plus an `Arc` for +/// the post-recognition keyword-boost pass. /// /// [`EntityRecord`]: crate::document::provenance::EntityRecord pub struct DetectionPhase { registry: Arc, + enhancer: Arc, } impl DetectionPhase { - /// Build the phase from the shared recognizer registry. Called - /// once per pipeline by the pipeline orchestrator. - pub fn new(registry: Arc) -> Self { - Self { registry } + /// Build the phase from the shared recognizer registry and + /// matching context enhancer. Called once per pipeline by the + /// pipeline orchestrator. 
+ pub fn new(registry: Arc, enhancer: Arc) -> Self { + Self { registry, enhancer } } pub(crate) async fn apply_text( @@ -84,7 +89,7 @@ impl DetectionPhase { let span = tracing::info_span!(target: TARGET, "phase", name = "detection.image"); let run_id = ctx.shared().run_id; async move { - detect_text_blocks(&self.registry, &mut tree.root, run_id).await?; + detect_text_blocks(&self.registry, &self.enhancer, &mut tree.root, run_id).await?; detect_image_chunks( &self.registry, &mut tree.root, @@ -111,7 +116,7 @@ impl DetectionPhase { let span = tracing::info_span!(target: TARGET, "phase", name = "detection.text_only"); let run_id = ctx.shared().run_id; async move { - detect_text_blocks(&self.registry, doc, run_id).await?; + detect_text_blocks(&self.registry, &self.enhancer, doc, run_id).await?; Ok(()) } .instrument(span) @@ -123,6 +128,7 @@ impl DetectionPhase { /// text via [`ModalityBlock::scan_text`] (today: every modality). async fn detect_text_blocks( registry: &RecognizerRegistry, + enhancer: &ContextEnhancer, doc: &mut Document, run_id: uuid::Uuid, ) -> Result<()> @@ -149,7 +155,13 @@ where let mut input = RecognizerInput::new(TextData::new(text.to_owned())); input.correlation_id = Some(run_id); - let detected = registry.run::(input).await?; + let mut detected = registry.run::(input).await?; + // Apply context-keyword boosting in block-local coordinates, + // before lifting to modality-absolute locations. The shared + // NLP-pass producer hasn't been wired into the detection + // pipeline yet, so we pass an empty `Artifacts` — the + // enhancer's substring path runs without it. 
+ enhancer.enhance(&mut detected, text, &Artifacts::new()); for entity in detected { let Some(location) = M::lift_from_block(&block.spans, entity.location.start, entity.location.end) diff --git a/crates/nvisy-engine/src/detection/pipeline.rs b/crates/nvisy-engine/src/detection/pipeline.rs index 0e6349b8..52b7fcff 100644 --- a/crates/nvisy-engine/src/detection/pipeline.rs +++ b/crates/nvisy-engine/src/detection/pipeline.rs @@ -134,12 +134,12 @@ impl DetectionPipeline { ) -> Result<(Vec, u64, DetectionStatus), Error> { let actor_id = prepared.actor_id; - let recognizer_registry = match self + let (recognizer_registry, context_enhancer) = match self .state .detection_config .build_for_request(&prepared.catalog) { - Ok(r) => Arc::new(r), + Ok(r) => (Arc::new(r.recognizers), Arc::new(r.enhancer)), Err(e) => { self.detections.fail(self.detection_id, e.to_string()).await; return Err(e); @@ -163,6 +163,7 @@ impl DetectionPipeline { let engines = DetectionEngines { extraction_engine: (*self.state.extraction_engine).clone(), recognizer_registry, + context_enhancer, }; let concurrency = self.base_config.effective_concurrency(); let ctx = DetectionContext::new(cancel, Arc::new(shared_data), engines, concurrency); diff --git a/crates/nvisy-ner/Cargo.toml b/crates/nvisy-ner/Cargo.toml index c802df26..bad8fe13 100644 --- a/crates/nvisy-ner/Cargo.toml +++ b/crates/nvisy-ner/Cargo.toml @@ -32,6 +32,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates +nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } # Async trait sugar diff --git a/crates/nvisy-ner/src/nlp/engine.rs b/crates/nvisy-ner/src/nlp/engine.rs index 3924c3cc..2ffa1779 100644 --- a/crates/nvisy-ner/src/nlp/engine.rs +++ b/crates/nvisy-ner/src/nlp/engine.rs @@ -9,7 +9,7 @@ //! [`RecognizerInput::with_artifacts`]. //! //! [`LanguageDetections`]: nvisy_core::primitive::LanguageDetections -//! [`Tokens`]: nvisy_core::context::Tokens +//! 
[`Tokens`]: nvisy_context::Tokens //! [`Artifacts`]: nvisy_core::extraction::Artifacts //! [`RecognizerInput`]: nvisy_core::recognition::RecognizerInput //! [`RecognizerInput::with_artifacts`]: nvisy_core::recognition::RecognizerInput::with_artifacts @@ -20,7 +20,7 @@ //! `process_batch`) once per scan; recognizers and the //! [`ContextEnhancer`] borrow the resulting map by reference. //! -//! [`ContextEnhancer`]: nvisy_core::context::ContextEnhancer +//! [`ContextEnhancer`]: nvisy_context::ContextEnhancer use nvisy_core::Result; use nvisy_core::primitive::LanguageTag; diff --git a/crates/nvisy-ner/src/nlp/mod.rs b/crates/nvisy-ner/src/nlp/mod.rs index 2f89fb8a..77fef86d 100644 --- a/crates/nvisy-ner/src/nlp/mod.rs +++ b/crates/nvisy-ner/src/nlp/mod.rs @@ -21,9 +21,9 @@ //! The trait is async because realistic implementations are //! HTTP-bound or otherwise yield. //! -//! [`Tokens`]: nvisy_core::context::Tokens +//! [`Tokens`]: nvisy_context::Tokens //! [`LanguageDetections`]: nvisy_core::primitive::LanguageDetections -//! [`ContextEnhancer`]: nvisy_core::context::ContextEnhancer +//! [`ContextEnhancer`]: nvisy_context::ContextEnhancer //! [`lingua`]: https://crates.io/crates/lingua //! [`NerBackend`]: crate::backend::NerBackend //! [`NerRecognizer`]: crate::NerRecognizer diff --git a/crates/nvisy-ner/src/recognition/config.rs b/crates/nvisy-ner/src/recognition/config.rs index f96b5888..a50c7b6f 100644 --- a/crates/nvisy-ner/src/recognition/config.rs +++ b/crates/nvisy-ner/src/recognition/config.rs @@ -62,7 +62,7 @@ pub struct NerModel { /// recognizer's [`name`] is used /// as the registration key. 
/// - /// [`ContextEnhancer`]: nvisy_core::context::ContextEnhancer + /// [`ContextEnhancer`]: nvisy_context::ContextEnhancer /// [`name`]: super::NerRecognizer::name pub default_context: Vec, } diff --git a/crates/nvisy-ner/src/recognition/recognizer.rs b/crates/nvisy-ner/src/recognition/recognizer.rs index 210b464b..bbdca67b 100644 --- a/crates/nvisy-ner/src/recognition/recognizer.rs +++ b/crates/nvisy-ner/src/recognition/recognizer.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use derive_builder::Builder; +use nvisy_context::{Context, ContextRegistry}; use nvisy_core::entity::{Entity, EntityLabelRef, ModelProvenance, TrailProvenance, TrailStep}; use nvisy_core::modality::{Text, TextLocation}; use nvisy_core::primitive::Confidence; @@ -40,7 +41,7 @@ pub struct NerRecognizer { /// the key the [`ContextEnhancer`] looks up to find the /// recognizer's [`default_context`]. /// - /// [`ContextEnhancer`]: nvisy_core::context::ContextEnhancer + /// [`ContextEnhancer`]: nvisy_context::ContextEnhancer /// [`default_context`]: NerModel::default_context name: String, /// Backend that turns `(text, kinds)` into raw spans. Required. @@ -91,6 +92,24 @@ impl NerRecognizer { &self.model } + /// Build a [`ContextRegistry`] containing this recognizer's + /// [`default_context`] keyed on the recognizer's name. Returns + /// an empty registry when no keywords were declared. + /// + /// Mirrors `PatternRegistry::context_registry` so engine code + /// can merge per-recognizer contexts from every text-modality + /// recognizer into one enhancer input without duplicating the + /// keyword data. 
+ /// + /// [`default_context`]: NerModel::default_context + #[must_use] + pub fn context_registry(&self) -> ContextRegistry { + ContextRegistry::new().with_entry( + self.name.clone(), + Context::new(self.model.default_context.iter().cloned()), + ) + } + fn build_entity(&self, span: &RawNerSpan, label: EntityLabelRef) -> Entity { let raw_confidence = Confidence::try_clamped(span.score).unwrap_or(self.model.default_score); diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index ffe5dc8d..574fadf8 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -24,6 +24,7 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates +nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } # (De)serialization diff --git a/crates/nvisy-pattern/src/recognition/dictionary.rs b/crates/nvisy-pattern/src/recognition/dictionary.rs index 856b707f..7875c2a2 100644 --- a/crates/nvisy-pattern/src/recognition/dictionary.rs +++ b/crates/nvisy-pattern/src/recognition/dictionary.rs @@ -23,8 +23,8 @@ //! [`with_terms`]: DictionaryBuilder::with_terms use derive_builder::Builder; +use nvisy_context::Context; use nvisy_core::Error; -use nvisy_core::context::Context; use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use schemars::JsonSchema; diff --git a/crates/nvisy-pattern/src/recognition/mod.rs b/crates/nvisy-pattern/src/recognition/mod.rs index 52876335..0ce29c61 100644 --- a/crates/nvisy-pattern/src/recognition/mod.rs +++ b/crates/nvisy-pattern/src/recognition/mod.rs @@ -1,10 +1,10 @@ //! Recognition primitives — the rule shapes ([`Regex`], //! [`Dictionary`]), their building blocks ([`Terms`] plus -//! [`Context`] from `nvisy-core`), +//! [`Context`] from `nvisy-context`), //! the [`PatternRegistry`] that bundles them, and the runtime //! [`PatternRecognizer`] that compiles them into pooled scanners. //! -//! 
[`Context`]: nvisy_core::context::Context +//! [`Context`]: nvisy_context::Context mod dictionary; mod recognizer; diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index 42fee424..ce987ee3 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -28,7 +28,7 @@ use crate::validators::{Validator, ValidatorRegistry}; /// the recognizer never reads it; the [`ContextEnhancer`] looks it /// up directly on the [`PatternRegistry`] at boost time. /// -/// [`ContextEnhancer`]: crate::ContextEnhancer +/// [`ContextEnhancer`]: nvisy_context::ContextEnhancer struct CompiledPattern { name: String, label: EntityLabelRef, diff --git a/crates/nvisy-pattern/src/recognition/regex_rule.rs b/crates/nvisy-pattern/src/recognition/regex_rule.rs index 5cfec944..55f303ca 100644 --- a/crates/nvisy-pattern/src/recognition/regex_rule.rs +++ b/crates/nvisy-pattern/src/recognition/regex_rule.rs @@ -10,8 +10,8 @@ //! [`Regex::from_toml`] when loading a definition file. use derive_builder::Builder; +use nvisy_context::Context; use nvisy_core::Error; -use nvisy_core::context::Context; use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use schemars::JsonSchema; diff --git a/crates/nvisy-pattern/src/recognition/registry.rs b/crates/nvisy-pattern/src/recognition/registry.rs index 6b675707..c763661a 100644 --- a/crates/nvisy-pattern/src/recognition/registry.rs +++ b/crates/nvisy-pattern/src/recognition/registry.rs @@ -10,9 +10,9 @@ //! [`Regex`] / [`Dictionary`] storage between the two consumers. //! //! [`PatternRecognizer`]: super::PatternRecognizer -//! [`ContextEnhancer`]: nvisy_core::context::ContextEnhancer +//! 
[`ContextEnhancer`]: nvisy_context::ContextEnhancer -use nvisy_core::context::ContextRegistry; +use nvisy_context::ContextRegistry; use nvisy_core::entity::EntityLabelCatalog; use super::dictionary::Dictionary; @@ -134,7 +134,7 @@ impl PatternRegistry { /// from — no duplication of keyword data between rule /// registration and enhancer construction. /// - /// [`ContextEnhancer`]: nvisy_core::context::ContextEnhancer + /// [`ContextEnhancer`]: nvisy_context::ContextEnhancer #[must_use] pub fn context_registry(&self) -> ContextRegistry { let mut registry = ContextRegistry::new(); diff --git a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs index 6bb637f6..dbc09cbc 100644 --- a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs +++ b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs @@ -6,8 +6,9 @@ //! //! [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement -use nvisy_core::context::{Context, ContextEnhancer}; +use nvisy_context::{Context, ContextEnhancer}; use nvisy_core::entity::{PatternProvenance, TrailProvenance, TrailStepKind, builtins}; +use nvisy_core::extraction::Artifacts; use nvisy_core::modality::TextData; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; @@ -56,7 +57,7 @@ async fn enhancer_boosts_matches_near_keyword_only() { .with_default_boost(0.3) .build() .expect("enhancer builds"); - enhancer.enhance(&mut entities, text, &type_map::concurrent::TypeMap::new()); + enhancer.enhance(&mut entities, text, &Artifacts::new()); // First match has `SSN:` within the 20-byte window → boosted. 
let near = entities From 534c1c7373384f95c6d17f50efad08dda2ad8aaf Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Jun 2026 14:04:10 +0200 Subject: [PATCH 02/14] chore(deps): clean up workspace deps + normalize per-crate manifests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops 9 unused workspace deps (hmac, include_dir, quick-xml, reqwest, serde_with, smallvec, stop-words, walkdir, zip) and reorders the root [workspace.dependencies] foundation-first: primitives → runtime → domain (text/document/image/audio) → integration (HTTP, AI, server, CLI) → storage → utilities. Removes per-crate machete-flagged deps and aligns every crate manifest with the new group names and order. Keeps calamine and unicode-segmentation in workspace deps for upcoming xlsx + word-boundary work. Marks humantime-serde as ignored in nvisy-llm/nvisy-engine/nvisy-server where it's used via serde `with =` strings. Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 11 ----- Cargo.toml | 83 ++++++++++++++------------------- crates/nvisy-cli/Cargo.toml | 16 ++++--- crates/nvisy-codec/Cargo.toml | 52 +++++++++++---------- crates/nvisy-core/Cargo.toml | 21 ++++----- crates/nvisy-engine/Cargo.toml | 61 ++++++++++-------------- crates/nvisy-fake/Cargo.toml | 10 ++-- crates/nvisy-llm/Cargo.toml | 42 +++++++++-------- crates/nvisy-ner/Cargo.toml | 26 +++++------ crates/nvisy-ocr/Cargo.toml | 19 ++++---- crates/nvisy-pattern/Cargo.toml | 17 +++---- crates/nvisy-server/Cargo.toml | 26 +++++------ crates/nvisy-stt/Cargo.toml | 6 +-- crates/nvisy-toolkit/Cargo.toml | 26 +++++------ 14 files changed, 185 insertions(+), 231 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a9b28db5..bfed20b0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2888,7 +2888,6 @@ dependencies = [ "symphonia", "tokio", "tracing", - "uuid", ] [[package]] @@ -2918,7 +2917,6 @@ dependencies = [ "serde_json", "strum 0.28.0", "thiserror", - "tracing", "type-map", "uuid", ] @@ 
-2960,8 +2958,6 @@ dependencies = [ "tokio-util", "toml", "tracing", - "type-map", - "unicode-normalization", "uuid", "validator", ] @@ -3001,7 +2997,6 @@ dependencies = [ "toml", "tracing", "unicode-normalization", - "uuid", ] [[package]] @@ -3026,7 +3021,6 @@ name = "nvisy-ocr" version = "0.1.0" dependencies = [ "async-trait", - "base64", "bentoml", "bytes", "futures", @@ -3051,7 +3045,6 @@ dependencies = [ "serde", "tokio", "toml", - "type-map", ] [[package]] @@ -3060,7 +3053,6 @@ version = "0.1.0" dependencies = [ "aide", "axum", - "base64", "derive_more", "futures", "humantime-serde", @@ -3068,7 +3060,6 @@ dependencies = [ "nvisy-core", "nvisy-engine", "schemars", - "semver", "serde", "tokio", "tower", @@ -3095,8 +3086,6 @@ dependencies = [ "aes-gcm", "async-trait", "base64", - "bytes", - "hipstr", "nvisy-codec", "nvisy-core", "nvisy-fake", diff --git a/Cargo.toml b/Cargo.toml index 0c74b1d1..0e9387ec 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,40 +43,17 @@ nvisy-core = { path = "./crates/nvisy-core", version = "0.1.0" } nvisy-engine = { path = "./crates/nvisy-engine", version = "0.1.0" } nvisy-fake = { path = "./crates/nvisy-fake", version = "0.1.0" } nvisy-llm = { path = "./crates/nvisy-llm", version = "0.1.0" } -nvisy-stt = { path = "./crates/nvisy-stt", version = "0.1.0" } -nvisy-toolkit = { path = "./crates/nvisy-toolkit", version = "0.1.0" } nvisy-ner = { path = "./crates/nvisy-ner", version = "0.1.0" } nvisy-ocr = { path = "./crates/nvisy-ocr", version = "0.1.0" } nvisy-pattern = { path = "./crates/nvisy-pattern", version = "0.1.0" } nvisy-server = { path = "./crates/nvisy-server", version = "0.1.0" } +nvisy-stt = { path = "./crates/nvisy-stt", version = "0.1.0" } +nvisy-toolkit = { path = "./crates/nvisy-toolkit", version = "0.1.0" } -# Inference & AI frameworks -bentoml = { version = "0.5", default-features = false, features = ["rustls-tls", "tracing"] } -rig = { version = "0.38", features = [], default-features = false } - -# HTTP client and 
middleware -reqwest = { version = "0.13", default-features = false, features = ["json", "rustls", "multipart"] } -reqwest-middleware = { version = "0.5", features = ["json", "multipart"] } -reqwest-retry = { version = "0.9", features = [] } -reqwest-tracing = { version = "0.7", features = [] } - -# Async runtime and parallelism -tokio = { version = "1.50", features = [] } -tokio-util = { version = "0.7", features = [] } -futures = { version = "0.3", features = [] } -async-trait = { version = "0.1", features = [] } -rayon = { version = "1.10", features = [] } - -# Observability -tracing = { version = "0.1", features = ["attributes"] } -tracing-subscriber = { version = "0.3", features = [] } - -# (De)serialization +# Serialization serde = { version = "1.0", features = ["derive"] } serde_json = { version = "1.0", features = [] } -serde_with = { version = "3.18", features = [] } schemars = { version = "1.0", features = ["uuid1", "bytes1"] } -csv = { version = "1.0", features = [] } toml = { version = "1.1", features = [] } minijinja = { version = "2.5", features = [] } @@ -100,32 +77,37 @@ type-map = { version = "0.5", features = [] } # Encoding and hashing base64 = { version = "0.22", features = [] } +hex = { version = "0.4", features = [] } sha2 = { version = "0.11", features = [] } aes-gcm = { version = "0.10", features = [] } -hmac = { version = "0.13", features = [] } -hex = { version = "0.4", features = [] } -# Pattern matching +# Async runtime and parallelism +tokio = { version = "1.50", features = [] } +tokio-util = { version = "0.7", features = [] } +futures = { version = "0.3", features = [] } +async-trait = { version = "0.1", features = [] } +rayon = { version = "1.10", features = [] } + +# Observability +tracing = { version = "0.1", features = ["attributes"] } +tracing-subscriber = { version = "0.3", features = [] } + +# Text processing (pattern matching, language detection, unicode) regex = { version = "1.0", features = [] } aho-corasick = { version = 
"1.0", features = [] } -smallvec = { version = "1.13", features = [] } - -# Language detection and text segmentation lingua = { version = "1.8", default-features = false, features = ["english"] } -stop-words = { version = "0.10", features = ["iso"] } unicode-segmentation = { version = "1.13", features = [] } unicode-normalization = { version = "0.1", features = [] } -# PDF processing (parsing, text extraction, page-to-image rendering) -lopdf = { version = "0.41", features = [] } -pdfium-render = { version = "0.9", features = [] } +# Tabular document parsing +csv = { version = "1.0", features = [] } +calamine = { version = "0.35", features = [] } -# Document parsing +# Rich-document parsing (HTML, PDF) scraper = { version = "0.27", features = [] } ego-tree = { version = "0.11", features = [] } -calamine = { version = "0.35", features = [] } -zip = { version = "8.4", features = [] } -quick-xml = { version = "0.40", features = [] } +lopdf = { version = "0.41", features = [] } +pdfium-render = { version = "0.9", features = [] } # Image processing image = { version = "0.25", default-features = false, features = ["png", "jpeg", "tiff"] } @@ -136,27 +118,30 @@ hound = { version = "3.5", features = [] } symphonia = { version = "0.6", default-features = false, features = ["wav", "pcm", "mp3"] } mp3lame-encoder = { version = "0.2", features = [] } -# CLI -clap = { version = "4.6", features = [] } +# AI / LLM frameworks +bentoml = { version = "0.5", default-features = false, features = ["rustls-tls", "tracing"] } +rig = { version = "0.38", features = [], default-features = false } + +# HTTP client and middleware +reqwest-middleware = { version = "0.5", features = ["json", "multipart"] } +reqwest-retry = { version = "0.9", features = [] } +reqwest-tracing = { version = "0.7", features = [] } -# HTTP server +# HTTP server and middleware axum = { version = "0.8", features = [] } aide = { version = "0.16.0-alpha.2", features = [] } tower = { version = "0.5", features = [] } 
tower-http = { version = "0.6", features = [] } -# Filesystem traversal -walkdir = { version = "2.5", features = [] } +# CLI +clap = { version = "4.6", features = [] } -# Storage, file detection, and asset embedding +# Storage and file-type detection fjall = { version = "3.1", features = [] } -include_dir = { version = "0.7", features = [] } infer = { version = "0.19", features = [] } # Utilities validator = { version = "0.20", features = ["derive"] } rand = { version = "0.10", features = [] } tempfile = { version = "3.27", features = [] } - -# Fake data generation fake = { version = "5.1", features = [] } diff --git a/crates/nvisy-cli/Cargo.toml b/crates/nvisy-cli/Cargo.toml index 7bb728cb..c8fa9aec 100644 --- a/crates/nvisy-cli/Cargo.toml +++ b/crates/nvisy-cli/Cargo.toml @@ -58,20 +58,16 @@ path = "src/main.rs" nvisy-engine = { workspace = true, features = [] } nvisy-server = { workspace = true, features = [] } -# CLI -clap = { workspace = true, features = ["derive", "env"] } - -# (De)serialization +# Serialization serde = { workspace = true, features = [] } toml = { workspace = true, features = [] } -humantime = { workspace = true, features = [] } humantime-serde = { workspace = true, features = [] } # Derive macros and error handling anyhow = { workspace = true, features = [] } -# HTTP server -axum = { workspace = true, features = ["tokio"] } +# Primitive datatypes +humantime = { workspace = true, features = [] } # Async runtime and parallelism tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } @@ -80,5 +76,11 @@ tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] } tracing = { workspace = true, features = [] } tracing-subscriber = { workspace = true, features = ["env-filter", "json"] } +# HTTP server and middleware +axum = { workspace = true, features = ["tokio"] } + +# CLI +clap = { workspace = true, features = ["derive", "env"] } + [package.metadata.cargo-machete] ignored = ["humantime-serde"] diff 
--git a/crates/nvisy-codec/Cargo.toml b/crates/nvisy-codec/Cargo.toml index c94df43b..ff975296 100644 --- a/crates/nvisy-codec/Cargo.toml +++ b/crates/nvisy-codec/Cargo.toml @@ -100,27 +100,39 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = [] } -# Async runtime -async-trait = { workspace = true, features = [] } -tokio = { workspace = true, features = ["sync"] } - -# (De)serialization +# Serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } +schemars = { workspace = true, features = [] } + +# Derive macros and error handling +derive_more = { workspace = true, features = ["as_ref", "deref", "from"] } # Primitive datatypes bytes = { workspace = true, features = [] } -uuid = { workspace = true, features = [] } -derive_more = { workspace = true, features = ["as_ref", "deref", "from"] } + +# Encoding and hashing hex = { workspace = true, features = [] } -infer = { workspace = true, features = [] } -schemars = { workspace = true, features = [] } sha2 = { workspace = true, features = [] } -# Image processing — pulled in unconditionally because the image -# handler structs reference `image::DynamicImage` directly. The -# workspace dep already enables png/jpeg/tiff decoders. `imageproc` -# powers the per-region gaussian blur in `image::redact`. 
+# Async runtime and parallelism +async-trait = { workspace = true, features = [] } +tokio = { workspace = true, features = ["sync"] } +rayon = { workspace = true, optional = true, features = [] } + +# Observability +tracing = { workspace = true, features = [] } + +# Tabular document parsing (feature-gated) +csv = { workspace = true, optional = true, features = [] } + +# Rich-document parsing (feature-gated: HTML + PDF) +scraper = { workspace = true, optional = true, features = [] } +ego-tree = { workspace = true, optional = true, features = [] } +lopdf = { workspace = true, optional = true, features = [] } +pdfium-render = { workspace = true, optional = true, features = [] } + +# Image processing image = { workspace = true, features = [] } imageproc = { workspace = true, features = [] } @@ -129,18 +141,8 @@ hound = { workspace = true, optional = true, features = [] } symphonia = { workspace = true, optional = true, features = [] } mp3lame-encoder = { workspace = true, optional = true, features = [] } -# PDF processing (feature-gated) -lopdf = { workspace = true, optional = true, features = [] } -pdfium-render = { workspace = true, optional = true, features = [] } -rayon = { workspace = true, optional = true, features = [] } - -# Document parsing (feature-gated) -csv = { workspace = true, optional = true, features = [] } -scraper = { workspace = true, optional = true, features = [] } -ego-tree = { workspace = true, optional = true, features = [] } - -# Observability -tracing = { workspace = true, features = [] } +# Storage and file-type detection +infer = { workspace = true, features = [] } [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index 3c1cee13..e815c4ca 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -29,26 +29,23 @@ default = [] test-utils = [] [dependencies] -# (De)serialization +# Serialization serde = { workspace = true, 
features = [] } serde_json = { workspace = true, features = [] } schemars = { workspace = true, features = [] } -# Primitive datatypes -bytes = { workspace = true, features = [] } -hipstr = { workspace = true, features = [] } -uuid = { workspace = true, features = [] } -oxilangtag = { workspace = true, features = [] } -type-map = { workspace = true, features = [] } - # Derive macros and error handling thiserror = { workspace = true, features = [] } derive_builder = { workspace = true, features = [] } derive_more = { workspace = true, features = ["as_ref", "deref", "deref_mut", "display", "from", "from_str", "into", "into_iterator", "is_variant"] } strum = { workspace = true, features = [] } -# Async trait sugar (object-safe async methods) -async-trait = { workspace = true, features = [] } +# Primitive datatypes +uuid = { workspace = true, features = [] } +bytes = { workspace = true, features = [] } +hipstr = { workspace = true, features = [] } +oxilangtag = { workspace = true, features = [] } +type-map = { workspace = true, features = [] } -# Observability -tracing = { workspace = true, features = [] } +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index 5f516ed7..cc4dcd90 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -39,69 +39,56 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates +nvisy-codec = { workspace = true, features = ["text"] } nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } -nvisy-toolkit = { workspace = true, features = [] } -nvisy-codec = { workspace = true, features = ["text"] } - -# Detection + extraction backends used by the phase wiring +nvisy-llm = { workspace = true, features = [] } nvisy-ner = { workspace = true, features = [] } nvisy-ocr = { workspace = true, features = [] } nvisy-pattern = { workspace = true, features = [] 
} -nvisy-llm = { workspace = true, features = [] } nvisy-stt = { workspace = true, features = [] } +nvisy-toolkit = { workspace = true, features = [] } -# Storage -fjall = { workspace = true, features = [] } - -# Encoding -base64 = { workspace = true, features = [] } - -# Cheap-clone strings (refcounted / inline / borrowed). Used on -# policy / rule names and the audit's [`PolicyDecisionRef`] so -# audit-heavy passes share refcounts rather than allocating -# per-entity. -hipstr = { workspace = true, features = [] } - -# (De)serialization +# Serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } schemars = { workspace = true, features = [] } toml = { workspace = true, features = [] } humantime-serde = { workspace = true, features = [] } -# Async runtime -async-trait = { workspace = true, features = [] } -futures = { workspace = true, features = [] } -tokio = { workspace = true, features = ["rt", "sync", "time", "macros"] } -tokio-util = { workspace = true, features = [] } +# Derive macros and error handling +anyhow = { workspace = true, features = [] } +derive_builder = { workspace = true, features = [] } +derive_more = { workspace = true, features = ["deref", "deref_mut", "display", "from", "from_str", "into", "into_iterator"] } +strum = { workspace = true, features = [] } # Primitive datatypes uuid = { workspace = true, features = [] } +bytes = { workspace = true, features = [] } +hipstr = { workspace = true, features = [] } jiff = { workspace = true, features = [] } semver = { workspace = true, features = ["serde"] } -bytes = { workspace = true, features = [] } -# Heterogeneous container for PolicyStore -type-map = { workspace = true, features = [] } +# Encoding and hashing +base64 = { workspace = true, features = [] } +aes-gcm = { workspace = true, features = [] } -# Derive macros and error handling -anyhow = { workspace = true, features = [] } -derive_builder = { workspace = true, features = [] } -derive_more 
= { workspace = true, features = ["deref", "deref_mut", "display", "from", "from_str", "into", "into_iterator"] } -strum = { workspace = true, features = [] } +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } +futures = { workspace = true, features = [] } +tokio = { workspace = true, features = ["rt", "sync", "time", "macros"] } +tokio-util = { workspace = true, features = [] } -# Encryption -aes-gcm = { workspace = true, features = [] } +# Observability +tracing = { workspace = true, features = [] } + +# Storage and file-type detection +fjall = { workspace = true, features = [] } # Utilities validator = { workspace = true, features = [] } rand = { workspace = true, features = [] } tempfile = { workspace = true, features = [], optional = true } -unicode-normalization = { workspace = true, features = [] } - -# Observability -tracing = { workspace = true, features = [] } [dev-dependencies] nvisy-engine = { path = ".", features = ["test-utils"] } diff --git a/crates/nvisy-fake/Cargo.toml b/crates/nvisy-fake/Cargo.toml index 2e979262..d2bdba91 100644 --- a/crates/nvisy-fake/Cargo.toml +++ b/crates/nvisy-fake/Cargo.toml @@ -26,14 +26,14 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = [] } -# Fake data generation -fake = { workspace = true, features = [] } +# Primitive datatypes (UUIDv4 for fake `DeviceId`) +uuid = { workspace = true, features = ["v4"] } -# Async runtime +# Async runtime and parallelism async-trait = { workspace = true, features = [] } -# UUIDs for fake DeviceId -uuid = { workspace = true, features = ["v4"] } +# Utilities +fake = { workspace = true, features = [] } [dev-dependencies] nvisy-core = { workspace = true, features = ["test-utils"] } diff --git a/crates/nvisy-llm/Cargo.toml b/crates/nvisy-llm/Cargo.toml index 6d1ea5c5..6e12f04f 100644 --- a/crates/nvisy-llm/Cargo.toml +++ b/crates/nvisy-llm/Cargo.toml @@ -36,39 +36,41 @@ rustdoc-args = ["--cfg", "docsrs"] # 
Internal crates nvisy-core = { workspace = true, features = [] } -# LLM framework -rig = { workspace = true, features = ["derive", "reqwest-middleware"] } - -# Async runtime -async-trait = { workspace = true, features = [] } - -# HTTP client + middleware (shared retry/tracing layers). -reqwest-middleware = { workspace = true, features = [] } -reqwest-retry = { workspace = true, features = [] } -reqwest-tracing = { workspace = true, features = [] } -humantime-serde = { workspace = true, features = [] } - -# (De)serialization +# Serialization serde = { workspace = true, features = [] } serde_json = { workspace = true, features = [] } schemars = { workspace = true, features = [] } toml = { workspace = true, features = ["parse"] } minijinja = { workspace = true, features = [] } - -# Primitive datatypes -uuid = { workspace = true, features = [] } - -# Encoding and hashing -base64 = { workspace = true, features = [] } -unicode-normalization = { workspace = true, features = [] } +humantime-serde = { workspace = true, features = [] } # Derive macros and error handling derive_builder = { workspace = true, features = [] } derive_more = { workspace = true, features = ["add_assign"] } thiserror = { workspace = true, features = [] } +# Encoding and hashing +base64 = { workspace = true, features = [] } + +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } + # Observability tracing = { workspace = true, features = [] } +# Text processing +unicode-normalization = { workspace = true, features = [] } + +# AI / LLM frameworks +rig = { workspace = true, features = ["derive", "reqwest-middleware"] } + +# HTTP client and middleware (shared retry/tracing layers) +reqwest-middleware = { workspace = true, features = [] } +reqwest-retry = { workspace = true, features = [] } +reqwest-tracing = { workspace = true, features = [] } + [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } + +[package.metadata.cargo-machete] +ignored = 
["humantime-serde"] diff --git a/crates/nvisy-ner/Cargo.toml b/crates/nvisy-ner/Cargo.toml index bad8fe13..6758bc16 100644 --- a/crates/nvisy-ner/Cargo.toml +++ b/crates/nvisy-ner/Cargo.toml @@ -35,29 +35,27 @@ rustdoc-args = ["--cfg", "docsrs"] nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } -# Async trait sugar -async-trait = { workspace = true, features = [] } - -# Inference & AI frameworks -bentoml = { workspace = true, optional = true } - -# Language detection -lingua = { workspace = true, features = [] } - -# (De)serialization +# Serialization serde = { workspace = true, features = ["derive"] } -# Builder derive macro +# Derive macros and error handling derive_builder = { workspace = true, features = [] } -# Heterogeneous typed map for shared-NLP artifacts +# Primitive datatypes +uuid = { workspace = true, features = ["v7"] } type-map = { workspace = true, features = [] } -# Identifiers -uuid = { workspace = true, features = ["v7"] } +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } +# Text processing (language detection) +lingua = { workspace = true, features = [] } + +# AI / LLM frameworks (feature-gated) +bentoml = { workspace = true, optional = true } + [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-ocr/Cargo.toml b/crates/nvisy-ocr/Cargo.toml index bc61457b..dc8999fd 100644 --- a/crates/nvisy-ocr/Cargo.toml +++ b/crates/nvisy-ocr/Cargo.toml @@ -36,24 +36,21 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = [] } -# Async runtime and parallelism -async-trait = { workspace = true, features = [] } -futures = { workspace = true, features = [] } -tokio = { workspace = true, features = ["rt"] } - -# BentoML inference client (feature-gated) -bentoml = { workspace = true, optional = true } - # Primitive datatypes 
(UUIDv7 for per-call request IDs from the # Bento backend; behind the `bento` feature gate at use sites). uuid = { workspace = true, features = ["v4", "v7"] } - -# Encoding -base64 = { workspace = true, features = [] } bytes = { workspace = true, features = [] } +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } +futures = { workspace = true, features = [] } +tokio = { workspace = true, features = ["rt"] } + # Observability tracing = { workspace = true, features = [] } +# AI / LLM frameworks (feature-gated) +bentoml = { workspace = true, optional = true } + [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 574fadf8..1c17cc80 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -27,22 +27,23 @@ rustdoc-args = ["--cfg", "docsrs"] nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } -# (De)serialization -csv = { workspace = true, features = [] } +# Serialization serde = { workspace = true, features = [] } -toml = { workspace = true, features = ["parse"] } schemars = { workspace = true, features = [] } +toml = { workspace = true, features = ["parse"] } -# Derive macros +# Derive macros and error handling derive_builder = { workspace = true, features = [] } -# Pattern matching +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } + +# Text processing (regex + Aho-Corasick literal matching) regex = { workspace = true, features = [] } aho-corasick = { workspace = true, features = [] } -# Async trait sugar -async-trait = { workspace = true, features = [] } +# Tabular document parsing (dictionary loading from CSV) +csv = { workspace = true, features = [] } [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } -type-map = { workspace = true, features = [] } diff --git a/crates/nvisy-server/Cargo.toml 
b/crates/nvisy-server/Cargo.toml index 79a785cb..eb2e7628 100644 --- a/crates/nvisy-server/Cargo.toml +++ b/crates/nvisy-server/Cargo.toml @@ -50,34 +50,30 @@ rustdoc-args = ["--cfg", "docsrs"] nvisy-core = { workspace = true, features = [] } nvisy-engine = { workspace = true, features = [] } -# Async runtime -futures = { workspace = true, features = [] } - -# HTTP server -axum = { workspace = true, features = ["json", "multipart", "tokio"] } -aide = { workspace = true, features = ["axum", "axum-json", "axum-query", "scalar", "macros"] } -tower = { workspace = true, features = ["util", "timeout"] } -tower-http = { workspace = true, features = ["trace", "cors", "timeout", "request-id", "limit", "compression-gzip", "compression-br", "compression-zstd", "sensitive-headers", "catch-panic", "util"] } - -# (De)serialization +# Serialization serde = { workspace = true, features = [] } schemars = { workspace = true, features = [] } humantime-serde = { workspace = true, features = [] } -# Encoding and hashing -base64 = { workspace = true, features = [] } - -# Derive macros +# Derive macros and error handling derive_more = { workspace = true, features = ["deref", "display"] } # Primitive datatypes uuid = { workspace = true, features = [] } jiff = { workspace = true, features = [] } -semver = { workspace = true, features = [] } + +# Async runtime and parallelism +futures = { workspace = true, features = [] } # Observability tracing = { workspace = true, features = [] } +# HTTP server and middleware +axum = { workspace = true, features = ["json", "multipart", "tokio"] } +aide = { workspace = true, features = ["axum", "axum-json", "axum-query", "scalar", "macros"] } +tower = { workspace = true, features = ["util", "timeout"] } +tower-http = { workspace = true, features = ["trace", "cors", "timeout", "request-id", "limit", "compression-gzip", "compression-br", "compression-zstd", "sensitive-headers", "catch-panic", "util"] } + [dev-dependencies] tokio = { workspace = true, 
features = ["macros", "rt"] } diff --git a/crates/nvisy-stt/Cargo.toml b/crates/nvisy-stt/Cargo.toml index 1381a229..6890e789 100644 --- a/crates/nvisy-stt/Cargo.toml +++ b/crates/nvisy-stt/Cargo.toml @@ -29,12 +29,12 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = [] } -# Async runtime -async-trait = { workspace = true, features = [] } - # Primitive datatypes uuid = { workspace = true, features = [] } +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } + # Observability tracing = { workspace = true, features = [] } diff --git a/crates/nvisy-toolkit/Cargo.toml b/crates/nvisy-toolkit/Cargo.toml index 86dd511d..88d17f47 100644 --- a/crates/nvisy-toolkit/Cargo.toml +++ b/crates/nvisy-toolkit/Cargo.toml @@ -46,39 +46,37 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-core = { workspace = true, features = [] } nvisy-codec = { workspace = true, features = [] } +nvisy-core = { workspace = true, features = [] } nvisy-llm = { workspace = true, features = [] } -nvisy-stt = { workspace = true, features = [] } nvisy-ner = { workspace = true, features = [] } nvisy-ocr = { workspace = true, features = [] } nvisy-pattern = { workspace = true, features = [] } +nvisy-stt = { workspace = true, features = [] } -# (De)serialization +# Serialization serde = { workspace = true, features = [] } schemars = { workspace = true, features = [] } -# Async runtime -async-trait = { workspace = true, features = [] } -tokio = { workspace = true, features = ["rt", "sync", "time", "macros"] } - # Primitive datatypes uuid = { workspace = true, features = [] } -bytes = { workspace = true, features = [] } -hipstr = { workspace = true, features = [] } type-map = { workspace = true, features = [] } -# Unicode-aware text folding for leak detection. 
-unicode-normalization = { workspace = true, features = [] } - -# Cryptography for built-in redaction operators +# Encoding and hashing +base64 = { workspace = true, features = [] } sha2 = { workspace = true, features = [] } aes-gcm = { workspace = true, features = ["aes", "alloc"], optional = true } -base64 = { workspace = true, features = [] } + +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } +tokio = { workspace = true, features = ["rt", "sync", "time", "macros"] } # Observability tracing = { workspace = true, features = [] } +# Text processing (unicode-aware folding for leak detection) +unicode-normalization = { workspace = true, features = [] } + [dev-dependencies] # Internal test utilities (Entity::test_builder, …). nvisy-core = { workspace = true, features = ["test-utils"] } From 682913cb2daea37dfdcaa944bff88f43d85f7f9a Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Jun 2026 14:07:12 +0200 Subject: [PATCH 03/14] style: cargo fmt Co-Authored-By: Claude Opus 4.7 --- crates/nvisy-engine/src/core/config.rs | 3 +-- crates/nvisy-engine/src/core/context.rs | 2 +- crates/nvisy-engine/src/core/ingestion/exporter.rs | 4 ++-- crates/nvisy-engine/src/core/ingestion/importer.rs | 6 +++--- crates/nvisy-engine/src/detection/config/mod.rs | 2 +- crates/nvisy-engine/src/detection/document.rs | 5 ++--- crates/nvisy-engine/src/detection/engine.rs | 2 +- crates/nvisy-engine/src/detection/orchestrator.rs | 5 ++--- .../src/detection/phases/deduplication.rs | 12 ++++++++---- .../nvisy-engine/src/detection/phases/detection.rs | 5 ++--- .../src/detection/phases/extraction.rs | 4 ++-- crates/nvisy-engine/src/detection/pipeline.rs | 8 +++----- crates/nvisy-engine/src/detection/plan.rs | 12 ++++++++---- crates/nvisy-engine/src/detection/result.rs | 2 +- crates/nvisy-engine/src/detection/state.rs | 14 +++++++++++--- crates/nvisy-engine/src/policy/audit.rs | 3 ++- crates/nvisy-engine/src/policy/redaction/mod.rs | 2 +- 
crates/nvisy-engine/src/policy/suppress.rs | 3 ++- crates/nvisy-engine/src/redaction/document.rs | 5 ++--- crates/nvisy-engine/src/redaction/engine.rs | 14 ++++---------- crates/nvisy-engine/src/redaction/orchestrator.rs | 5 ++--- crates/nvisy-engine/src/redaction/phases/phase.rs | 3 +-- .../src/redaction/phases/validation.rs | 3 +-- crates/nvisy-engine/src/redaction/pipeline.rs | 9 ++++----- crates/nvisy-engine/tests/redaction_policy.rs | 5 ++++- .../nvisy-server/src/handler/request/detections.rs | 2 +- 26 files changed, 72 insertions(+), 68 deletions(-) diff --git a/crates/nvisy-engine/src/core/config.rs b/crates/nvisy-engine/src/core/config.rs index a5918188..fcc4cffc 100644 --- a/crates/nvisy-engine/src/core/config.rs +++ b/crates/nvisy-engine/src/core/config.rs @@ -26,8 +26,7 @@ use std::num::NonZeroUsize; use std::time::Duration; -use nvisy_core::Error; -use nvisy_core::Result; +use nvisy_core::{Error, Result}; use nvisy_llm::backend::http::HttpConfig; use semver::Version; use serde::{Deserialize, Serialize}; diff --git a/crates/nvisy-engine/src/core/context.rs b/crates/nvisy-engine/src/core/context.rs index 865e8102..84391b2c 100644 --- a/crates/nvisy-engine/src/core/context.rs +++ b/crates/nvisy-engine/src/core/context.rs @@ -23,8 +23,8 @@ use nvisy_toolkit::extraction::ExtractorRegistry; use tokio_util::sync::CancellationToken; use super::SharedData; -use crate::redaction::phases::RedactionRegistries; use crate::redaction::RedactionConfig; +use crate::redaction::phases::RedactionRegistries; /// Shared surface every phase reads from regardless of which side /// (detection or redaction) it runs on. 
Implemented by both diff --git a/crates/nvisy-engine/src/core/ingestion/exporter.rs b/crates/nvisy-engine/src/core/ingestion/exporter.rs index 6bede14b..95b83940 100644 --- a/crates/nvisy-engine/src/core/ingestion/exporter.rs +++ b/crates/nvisy-engine/src/core/ingestion/exporter.rs @@ -12,11 +12,11 @@ use nvisy_codec::content::{Content, ContentData, ContentSource}; use nvisy_core::Result; use uuid::Uuid; -use crate::core::{AnyTree, DocumentTree, SharedData}; -use crate::modality::DocumentModality; use crate::core::ingestion::compression::CompressionService; use crate::core::ingestion::encryption::CryptoService; use crate::core::ingestion::{CompressionAlgorithm, EncryptionConfig}; +use crate::core::{AnyTree, DocumentTree, SharedData}; +use crate::modality::DocumentModality; const TARGET: &str = "nvisy_engine::op::export_file"; diff --git a/crates/nvisy-engine/src/core/ingestion/importer.rs b/crates/nvisy-engine/src/core/ingestion/importer.rs index 321c3a05..0c6da15b 100644 --- a/crates/nvisy-engine/src/core/ingestion/importer.rs +++ b/crates/nvisy-engine/src/core/ingestion/importer.rs @@ -31,15 +31,15 @@ use nvisy_core::entity::{Annotation, LabelAnnotation}; use nvisy_core::modality::{Audio, Image, Tabular, Text}; use nvisy_core::{Error, Result}; +use crate::core::ingestion::compression::CompressionService; +use crate::core::ingestion::encryption::{CryptoService, EncryptedContent}; +use crate::core::ingestion::{CompressionAlgorithm, EncryptionAlgorithm, EncryptionConfig}; use crate::core::{AnyTree, DocumentTree, SharedData}; use crate::document::{AnyAnnotations, Document}; use crate::modality::{ AudioExtraction, AudioMetadata, DocumentModality, ImageExtraction, ImageMetadata, TabularExtraction, TabularMetadata, TextExtraction, TextMetadata, }; -use crate::core::ingestion::compression::CompressionService; -use crate::core::ingestion::encryption::{CryptoService, EncryptedContent}; -use crate::core::ingestion::{CompressionAlgorithm, EncryptionAlgorithm, 
EncryptionConfig}; const TARGET: &str = "nvisy_engine::op::import_file"; diff --git a/crates/nvisy-engine/src/detection/config/mod.rs b/crates/nvisy-engine/src/detection/config/mod.rs index f9c77164..9dcd3a14 100644 --- a/crates/nvisy-engine/src/detection/config/mod.rs +++ b/crates/nvisy-engine/src/detection/config/mod.rs @@ -15,9 +15,9 @@ mod ner; mod pattern; +use nvisy_context::{ContextEnhancer, ContextRegistry}; #[cfg(not(feature = "bento"))] use nvisy_core::Error; -use nvisy_context::{ContextEnhancer, ContextRegistry}; use nvisy_core::Result; use nvisy_core::entity::EntityLabelCatalog; use nvisy_core::modality::Text; diff --git a/crates/nvisy-engine/src/detection/document.rs b/crates/nvisy-engine/src/detection/document.rs index 378906b8..ab9f2207 100644 --- a/crates/nvisy-engine/src/detection/document.rs +++ b/crates/nvisy-engine/src/detection/document.rs @@ -5,12 +5,11 @@ use nvisy_core::modality::{Audio, Image, Tabular, Text}; use nvisy_core::{Error, Result}; -use crate::core::PhaseContext as _; -use crate::core::{DetectionContext, DocumentTree}; +use crate::core::{DetectionContext, DocumentTree, PhaseContext as _}; +use crate::detection::DetectionPlan; use crate::detection::phases::deduplication::DeduplicationPhase; use crate::detection::phases::detection::DetectionPhase; use crate::detection::phases::extraction::ExtractionPhase; -use crate::detection::DetectionPlan; const TARGET: &str = "nvisy_engine::pipeline::detection::document"; diff --git a/crates/nvisy-engine/src/detection/engine.rs b/crates/nvisy-engine/src/detection/engine.rs index a4da3d61..ab644e50 100644 --- a/crates/nvisy-engine/src/detection/engine.rs +++ b/crates/nvisy-engine/src/detection/engine.rs @@ -23,8 +23,8 @@ use super::result::DetectionResult; use super::state::DetectionState; use super::status::DetectionStatus; use super::{DetectionEntry, DetectionFilter, DetectionInput, DetectionSnapshot}; -use crate::core::ingestion::encryption::SharedKeyProvider; use crate::core::RuntimeConfig; 
+use crate::core::ingestion::encryption::SharedKeyProvider; use crate::detection::{DetectionConfig, ExtractionConfig}; use crate::registry::Registry; diff --git a/crates/nvisy-engine/src/detection/orchestrator.rs b/crates/nvisy-engine/src/detection/orchestrator.rs index 3764c8ea..75f366ad 100644 --- a/crates/nvisy-engine/src/detection/orchestrator.rs +++ b/crates/nvisy-engine/src/detection/orchestrator.rs @@ -8,11 +8,10 @@ use tokio::sync::Semaphore; use tokio::task::JoinSet; use super::document::DetectionDocumentPipeline; -use crate::core::PhaseContext as _; -use crate::core::{AnyTree, DetectionContext}; -use crate::document::provenance::AnyAudit; use crate::core::ingestion::{ImportFile, Importer}; +use crate::core::{AnyTree, DetectionContext, PhaseContext as _}; use crate::detection::DetectionPlan; +use crate::document::provenance::AnyAudit; const TARGET: &str = "nvisy_engine::pipeline::detection::orchestrator"; diff --git a/crates/nvisy-engine/src/detection/phases/deduplication.rs b/crates/nvisy-engine/src/detection/phases/deduplication.rs index fb2f3d6c..8c3dd742 100644 --- a/crates/nvisy-engine/src/detection/phases/deduplication.rs +++ b/crates/nvisy-engine/src/detection/phases/deduplication.rs @@ -17,11 +17,10 @@ use nvisy_toolkit::deduplication::{LayerContext, LayerPipeline, SpanSize}; use tracing::Instrument; use uuid::Uuid; -use crate::core::PhaseContext as _; -use crate::core::{DetectionContext, DocumentTree}; +use crate::core::{DetectionContext, DocumentTree, PhaseContext as _}; +use crate::detection::{DeduplicationParams, DetectionPlan}; use crate::document::provenance::EntityRecord; use crate::modality::DocumentModality; -use crate::detection::{DeduplicationParams, DetectionPlan}; const TARGET: &str = "nvisy_engine::deduplication"; @@ -73,7 +72,12 @@ impl DeduplicationPhase { self.run(ctx, plan, tree).await } - async fn run(&self, ctx: &DetectionContext, plan: &DetectionPlan, tree: &mut DocumentTree) -> Result<()> + async fn run( + &self, + ctx: 
&DetectionContext, + plan: &DetectionPlan, + tree: &mut DocumentTree, + ) -> Result<()> where M: DocumentModality, M::Location: Overlap + SpanSize, diff --git a/crates/nvisy-engine/src/detection/phases/detection.rs b/crates/nvisy-engine/src/detection/phases/detection.rs index ef60bf51..9857308b 100644 --- a/crates/nvisy-engine/src/detection/phases/detection.rs +++ b/crates/nvisy-engine/src/detection/phases/detection.rs @@ -23,11 +23,10 @@ use nvisy_core::recognition::RecognizerInput; use nvisy_toolkit::detection::RecognizerRegistry; use tracing::Instrument; -use crate::core::PhaseContext as _; -use crate::core::{DetectionContext, DocumentTree}; +use crate::core::{DetectionContext, DocumentTree, PhaseContext as _}; +use crate::detection::DetectionPlan; use crate::document::{Document, Span}; use crate::modality::{DocumentModality, ModalityBlock}; -use crate::detection::DetectionPlan; const TARGET: &str = "nvisy_engine::detection"; diff --git a/crates/nvisy-engine/src/detection/phases/extraction.rs b/crates/nvisy-engine/src/detection/phases/extraction.rs index e75243e7..244cb2e8 100644 --- a/crates/nvisy-engine/src/detection/phases/extraction.rs +++ b/crates/nvisy-engine/src/detection/phases/extraction.rs @@ -27,10 +27,10 @@ use nvisy_ocr::types::RawOcrBlock; use nvisy_toolkit::extraction::{Extractor, ExtractorRegistry, ImageExtractorOutput}; use tracing::Instrument; -use crate::core::{DocumentTree, DetectionContext}; +use crate::core::{DetectionContext, DocumentTree}; +use crate::detection::{DetectionPlan, Extraction}; use crate::document::{Block, Document, Span}; use crate::modality::{ImageBlock, TabularBlock, TextBlock, TextContent}; -use crate::detection::{DetectionPlan, Extraction}; const TARGET: &str = "nvisy_engine::extraction"; diff --git a/crates/nvisy-engine/src/detection/pipeline.rs b/crates/nvisy-engine/src/detection/pipeline.rs index 52b7fcff..be2c0118 100644 --- a/crates/nvisy-engine/src/detection/pipeline.rs +++ 
b/crates/nvisy-engine/src/detection/pipeline.rs @@ -20,12 +20,11 @@ use super::orchestrator::DetectionOrchestrator; use super::result::DetectionResult; use super::state::{DetectionRecord, DetectionState}; use super::status::DetectionStatus; -use crate::core::{DetectionContext, DetectionEngines, PolicyStore, SharedData}; -use crate::document::provenance::AnyAudit; use crate::core::ingestion::ImportFile; use crate::core::ingestion::encryption::SharedKeyProvider; -use crate::core::RuntimeConfig; +use crate::core::{DetectionContext, DetectionEngines, PolicyStore, RuntimeConfig, SharedData}; use crate::detection::DetectionConfig; +use crate::document::provenance::AnyAudit; use crate::policy::{Policy, PolicyDigest}; use crate::registry::Registry; @@ -93,8 +92,7 @@ impl DetectionPipeline { input.validate_actions()?; let actor_id = input.actor_id; - let policy_digests: Vec<PolicyDigest> = - input.policies.iter().map(Policy::digest).collect(); + let policy_digests: Vec<PolicyDigest> = input.policies.iter().map(Policy::digest).collect(); let policies = Arc::new(PolicyStore::from_policies(input.policies)); self.detections diff --git a/crates/nvisy-engine/src/detection/plan.rs b/crates/nvisy-engine/src/detection/plan.rs index 6dea187f..77777a81 100644 --- a/crates/nvisy-engine/src/detection/plan.rs +++ b/crates/nvisy-engine/src/detection/plan.rs @@ -58,20 +58,24 @@ pub struct Extraction { /// Text-modality plan knobs. No tunables today; reserved for future /// per-call settings (e.g. whitespace normalization). -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] pub struct TextPlan {} /// Tabular-modality plan knobs. No tunables today. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] pub struct TabularPlan {} /// Image-modality plan knobs.
No tunables today; reserved for /// future OCR tuning (e.g. language hint, page subset). -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] pub struct ImagePlan {} /// Audio-modality plan knobs (speech-to-text). -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] pub struct AudioPlan { /// Segment the audio by speaker identity. /// diff --git a/crates/nvisy-engine/src/detection/result.rs b/crates/nvisy-engine/src/detection/result.rs index 2b5e80c7..a2d832e7 100644 --- a/crates/nvisy-engine/src/detection/result.rs +++ b/crates/nvisy-engine/src/detection/result.rs @@ -7,8 +7,8 @@ use serde::{Deserialize, Serialize}; use uuid::Uuid; use super::status::DetectionStatus; -use crate::document::provenance::AnyAudit; use crate::core::ingestion::ImportFile; +use crate::document::provenance::AnyAudit; use crate::policy::PolicyDigest; /// Immutable artifact produced by one detection pass. diff --git a/crates/nvisy-engine/src/detection/state.rs b/crates/nvisy-engine/src/detection/state.rs index 8f8169b4..4109a26c 100644 --- a/crates/nvisy-engine/src/detection/state.rs +++ b/crates/nvisy-engine/src/detection/state.rs @@ -17,8 +17,8 @@ use uuid::Uuid; use super::result::{DetectionEntry, DetectionFilter, DetectionResult, DetectionSnapshot}; use super::status::DetectionStatus; use crate::core::PolicyStore; -use crate::document::provenance::AnyAudit; use crate::core::ingestion::ImportFile; +use crate::document::provenance::AnyAudit; use crate::policy::PolicyDigest; const TARGET: &str = "nvisy_engine::detection::state"; @@ -131,7 +131,11 @@ impl DetectionState { /// callers). 
/// /// [`ErrorKind::NotFound`]: nvisy_core::ErrorKind::NotFound - pub(crate) async fn snapshot(&self, actor_id: Uuid, id: Uuid) -> Result { + pub(crate) async fn snapshot( + &self, + actor_id: Uuid, + id: Uuid, + ) -> Result { let guard = self.inner.read().await; let Some(record) = guard.get(&id) else { return Err(Error::not_found( @@ -206,7 +210,11 @@ impl DetectionState { }) } - pub(crate) async fn list(&self, actor_id: Uuid, filter: DetectionFilter) -> Vec { + pub(crate) async fn list( + &self, + actor_id: Uuid, + filter: DetectionFilter, + ) -> Vec { let guard = self.inner.read().await; let mut out: Vec = guard .iter() diff --git a/crates/nvisy-engine/src/policy/audit.rs b/crates/nvisy-engine/src/policy/audit.rs index 46d63096..85264fa0 100644 --- a/crates/nvisy-engine/src/policy/audit.rs +++ b/crates/nvisy-engine/src/policy/audit.rs @@ -15,7 +15,8 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; /// Payload for the `audit` action. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct AuditAction { /// Severity hint propagated into the audit entry — e.g. 
diff --git a/crates/nvisy-engine/src/policy/redaction/mod.rs b/crates/nvisy-engine/src/policy/redaction/mod.rs index 46ac8040..254e59f6 100644 --- a/crates/nvisy-engine/src/policy/redaction/mod.rs +++ b/crates/nvisy-engine/src/policy/redaction/mod.rs @@ -24,8 +24,8 @@ mod image; mod tabular; mod text; -pub use nvisy_toolkit::redaction::anonymizer::HashAlgorithm; use nvisy_core::modality::{Audio, Image, Tabular, Text}; +pub use nvisy_toolkit::redaction::anonymizer::HashAlgorithm; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; diff --git a/crates/nvisy-engine/src/policy/suppress.rs b/crates/nvisy-engine/src/policy/suppress.rs index a6543e0a..99fa6209 100644 --- a/crates/nvisy-engine/src/policy/suppress.rs +++ b/crates/nvisy-engine/src/policy/suppress.rs @@ -11,7 +11,8 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; /// Payload for the `suppress` action. -#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize, JsonSchema)] +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct SuppressAction { /// Human-readable reason the entity is being suppressed. 
Surfaced diff --git a/crates/nvisy-engine/src/redaction/document.rs b/crates/nvisy-engine/src/redaction/document.rs index ef8acda7..f5c802bb 100644 --- a/crates/nvisy-engine/src/redaction/document.rs +++ b/crates/nvisy-engine/src/redaction/document.rs @@ -4,11 +4,10 @@ use nvisy_core::modality::{Audio, Image, Tabular, Text}; use nvisy_core::{Error, Result}; -use crate::core::PhaseContext as _; -use crate::core::{DocumentTree, RedactionContext}; +use crate::core::{DocumentTree, PhaseContext as _, RedactionContext}; +use crate::redaction::RedactionPlan; use crate::redaction::phases::RedactionPhase; use crate::redaction::phases::validation::ValidationPhase; -use crate::redaction::RedactionPlan; const TARGET: &str = "nvisy_engine::pipeline::redaction::document"; diff --git a/crates/nvisy-engine/src/redaction/engine.rs b/crates/nvisy-engine/src/redaction/engine.rs index 836e580b..75ba09ba 100644 --- a/crates/nvisy-engine/src/redaction/engine.rs +++ b/crates/nvisy-engine/src/redaction/engine.rs @@ -26,11 +26,11 @@ use super::result::RedactionResult; use super::state::RedactionState; use super::status::RedactionStatus; use super::{RedactionEntry, RedactionFilter, RedactionInput, RedactionSnapshot}; -use crate::core::ingestion::encryption::SharedKeyProvider; -use crate::redaction::phases::RedactionRegistries; use crate::core::RuntimeConfig; -use crate::redaction::RedactionConfig; +use crate::core::ingestion::encryption::SharedKeyProvider; use crate::detection::{DetectionEngine, DetectionState}; +use crate::redaction::RedactionConfig; +use crate::redaction::phases::RedactionRegistries; use crate::registry::Registry; /// Shared inner state for a [`RedactionEngine`], held behind an @@ -91,13 +91,7 @@ impl RedactionEngine { /// [`DetectionState`]: crate::detection::DetectionState /// [`shutdown`]: Self::shutdown pub fn from_detection(detection: &DetectionEngine) -> Self { - let redaction_config = Arc::new( - detection - .config() - .redaction - .clone() - .unwrap_or_default(), 
- ); + let redaction_config = Arc::new(detection.config().redaction.clone().unwrap_or_default()); Self { inner: Arc::new(RedactionInner { runtime_config: detection.config().clone(), diff --git a/crates/nvisy-engine/src/redaction/orchestrator.rs b/crates/nvisy-engine/src/redaction/orchestrator.rs index 2a8a49dd..ceb52111 100644 --- a/crates/nvisy-engine/src/redaction/orchestrator.rs +++ b/crates/nvisy-engine/src/redaction/orchestrator.rs @@ -19,10 +19,9 @@ use tokio::sync::Semaphore; use tokio::task::JoinSet; use super::document::RedactionDocumentPipeline; -use crate::core::PhaseContext as _; -use crate::core::{AnyTree, RedactionContext}; -use crate::document::provenance::AnyAudit; use crate::core::ingestion::{ExportFile, Exporter, ImportFile, Importer}; +use crate::core::{AnyTree, PhaseContext as _, RedactionContext}; +use crate::document::provenance::AnyAudit; use crate::redaction::RedactionPlan; const TARGET: &str = "nvisy_engine::pipeline::redaction::orchestrator"; diff --git a/crates/nvisy-engine/src/redaction/phases/phase.rs b/crates/nvisy-engine/src/redaction/phases/phase.rs index a52c1ceb..87fa286f 100644 --- a/crates/nvisy-engine/src/redaction/phases/phase.rs +++ b/crates/nvisy-engine/src/redaction/phases/phase.rs @@ -14,8 +14,7 @@ use nvisy_core::Result; use nvisy_core::modality::{Audio, Image, Tabular, Text}; -use crate::core::PhaseContext as _; -use crate::core::{DocumentTree, RedactionContext}; +use crate::core::{DocumentTree, PhaseContext as _, RedactionContext}; use crate::redaction::phases::registries::RedactionRegistries; use crate::redaction::phases::run_redaction; use crate::redaction::{RedactionConfig, RedactionPlan}; diff --git a/crates/nvisy-engine/src/redaction/phases/validation.rs b/crates/nvisy-engine/src/redaction/phases/validation.rs index ccccc9ae..4baa3457 100644 --- a/crates/nvisy-engine/src/redaction/phases/validation.rs +++ b/crates/nvisy-engine/src/redaction/phases/validation.rs @@ -20,8 +20,7 @@ use nvisy_toolkit::validation::{ }; 
use tracing::Instrument; -use crate::core::PhaseContext as _; -use crate::core::{DocumentTree, RedactionContext}; +use crate::core::{DocumentTree, PhaseContext as _, RedactionContext}; use crate::document::Document; use crate::modality::DocumentModality; use crate::redaction::RedactionPlan; diff --git a/crates/nvisy-engine/src/redaction/pipeline.rs b/crates/nvisy-engine/src/redaction/pipeline.rs index df1cb389..15c0ebd6 100644 --- a/crates/nvisy-engine/src/redaction/pipeline.rs +++ b/crates/nvisy-engine/src/redaction/pipeline.rs @@ -14,13 +14,12 @@ use super::orchestrator::RedactionOrchestrator; use super::result::RedactionResult; use super::state::{RedactionRecord, RedactionState}; use super::status::RedactionStatus; -use crate::core::{RedactionContext, RedactionEngines, SharedData}; -use crate::document::provenance::AnyAudit; use crate::core::ingestion::encryption::SharedKeyProvider; -use crate::redaction::phases::RedactionRegistries; -use crate::redaction::RedactionConfig; -use crate::core::RuntimeConfig; +use crate::core::{RedactionContext, RedactionEngines, RuntimeConfig, SharedData}; use crate::detection::DetectionState; +use crate::document::provenance::AnyAudit; +use crate::redaction::RedactionConfig; +use crate::redaction::phases::RedactionRegistries; use crate::registry::Registry; const TARGET: &str = "nvisy_engine::pipeline::redaction::pipeline"; diff --git a/crates/nvisy-engine/tests/redaction_policy.rs b/crates/nvisy-engine/tests/redaction_policy.rs index b8f3cf6b..1e72ab7e 100644 --- a/crates/nvisy-engine/tests/redaction_policy.rs +++ b/crates/nvisy-engine/tests/redaction_policy.rs @@ -118,7 +118,10 @@ fn action_redact_round_trips_through_json() { ..Default::default() }); let json = serde_json::to_string(&action).expect("serialize"); - assert!(json.contains("\"redact\""), "expected redact tag, got {json}"); + assert!( + json.contains("\"redact\""), + "expected redact tag, got {json}" + ); assert!( json.contains("\"text\""), "expected text operator, 
got {json}" diff --git a/crates/nvisy-server/src/handler/request/detections.rs b/crates/nvisy-server/src/handler/request/detections.rs index e68b05eb..379c3f04 100644 --- a/crates/nvisy-server/src/handler/request/detections.rs +++ b/crates/nvisy-server/src/handler/request/detections.rs @@ -13,7 +13,7 @@ use crate::handler::request::pagination::Pagination; #[derive(Debug, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct NewDetection { - /// Policies to apply, in precedence order + /// Policies to apply, in precedence order /// (index 0 is highest precedence). pub policies: Vec, /// Content sources to ingest at the start of the pass. From 111bbac0819834141943027dcfc8c460fbc510b7 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Jun 2026 21:26:54 +0200 Subject: [PATCH 04/14] =?UTF-8?q?refactor(pattern,context):=20rename=20Pat?= =?UTF-8?q?tern=E2=86=92Regex,=20inline=20registries,=20normalize=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nvisy-pattern: rename Detector→Pattern→Regex; inline PatternRegistry into PatternRecognizerBuilder; split CompiledPattern out; export built-in validators by bare-noun names (luhn, iban, ssn, phone, date); add Scoring::get + per-column resolution; convert pattern assets to TOML; normalize module/function docs (returns-form for predicates, reference-form doc-links, # Errors + code examples for public types). - nvisy-context: extract registry/declaration into rule + wrapper; trim enhancer/matcher/tokens surface. - nvisy-toolkit: drop stale PatternRegistry usage in pipeline example; fix broken rustdoc links in redaction module. - nvisy-engine, nvisy-ner: knock-on updates for the new pattern and context surfaces. 
Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 10 +- crates/nvisy-context/Cargo.toml | 14 +- crates/nvisy-context/src/declaration.rs | 89 --- crates/nvisy-context/src/enhancer.rs | 718 ++++++++++++------ crates/nvisy-context/src/lib.rs | 10 +- crates/nvisy-context/src/matcher.rs | 117 ++- crates/nvisy-context/src/registry.rs | 122 --- crates/nvisy-context/src/rule.rs | 140 ++++ crates/nvisy-context/src/tokens.rs | 151 +--- crates/nvisy-context/src/wrapper.rs | 77 ++ crates/nvisy-engine/Cargo.toml | 1 - crates/nvisy-engine/src/core/context.rs | 17 - .../nvisy-engine/src/detection/config/mod.rs | 57 +- crates/nvisy-engine/src/detection/document.rs | 5 +- crates/nvisy-engine/src/detection/mod.rs | 4 +- .../src/detection/phases/detection.rs | 30 +- crates/nvisy-engine/src/detection/pipeline.rs | 5 +- crates/nvisy-ner/Cargo.toml | 1 - crates/nvisy-ner/src/nlp/capabilities.rs | 2 +- crates/nvisy-ner/src/nlp/engine.rs | 22 +- crates/nvisy-ner/src/nlp/mod.rs | 18 +- crates/nvisy-ner/src/recognition/config.rs | 12 - .../nvisy-ner/src/recognition/recognizer.rs | 27 +- crates/nvisy-pattern/Cargo.toml | 1 - crates/nvisy-pattern/README.md | 41 +- .../dictionaries/general/languages.toml | 15 +- .../assets/patterns/contact/email.toml | 2 + .../assets/patterns/contact/phone.toml | 7 +- .../assets/patterns/contact/url.toml | 2 + .../assets/patterns/credentials/aws_key.toml | 2 + .../patterns/credentials/generic_api_key.toml | 2 + .../patterns/credentials/github_token.toml | 2 + .../patterns/credentials/private_key.toml | 2 + .../patterns/credentials/stripe_key.toml | 2 + .../patterns/finance/bitcoin_address.toml | 2 + .../assets/patterns/finance/credit_card.toml | 6 +- .../patterns/finance/ethereum_address.toml | 2 + .../assets/patterns/finance/iban.toml | 6 +- .../assets/patterns/finance/swift_code.toml | 2 + .../patterns/finance/us_bank_routing.toml | 2 + .../assets/patterns/identity/ssn.toml | 6 +- .../patterns/identity/us_drivers_license.toml | 2 + 
.../assets/patterns/identity/us_passport.toml | 2 + .../patterns/identity/us_postal_code.toml | 2 + .../assets/patterns/network/ipv4.toml | 2 + .../assets/patterns/network/ipv6.toml | 2 + .../assets/patterns/network/mac_address.toml | 2 + .../patterns/personal/date_of_birth.toml | 7 +- .../assets/patterns/personal/datetime.toml | 6 +- crates/nvisy-pattern/src/lib.rs | 4 +- .../nvisy-pattern/src/recognition/compiled.rs | 166 ++++ .../src/recognition/dictionary.rs | 174 +++-- crates/nvisy-pattern/src/recognition/mod.rs | 25 +- .../src/recognition/recognizer.rs | 485 ++++++------ crates/nvisy-pattern/src/recognition/regex.rs | 149 ++++ .../src/recognition/regex_rule.rs | 107 --- .../nvisy-pattern/src/recognition/registry.rs | 167 ---- crates/nvisy-pattern/src/recognition/terms.rs | 163 ++-- crates/nvisy-pattern/src/shipped/mod.rs | 8 +- crates/nvisy-pattern/src/shipped/patterns.rs | 2 +- crates/nvisy-pattern/src/validators/date.rs | 41 +- crates/nvisy-pattern/src/validators/iban.rs | 22 +- crates/nvisy-pattern/src/validators/luhn.rs | 26 +- crates/nvisy-pattern/src/validators/mod.rs | 44 +- crates/nvisy-pattern/src/validators/phone.rs | 33 +- crates/nvisy-pattern/src/validators/ssn.rs | 28 +- .../testdata/patterns/employee_id.toml | 2 + .../testdata/patterns/product_codes.toml | 2 + .../nvisy-pattern/tests/enhancer_roundtrip.rs | 80 +- .../nvisy-pattern/tests/shipped_detection.rs | 15 +- crates/nvisy-pattern/tests/user_rules.rs | 28 +- crates/nvisy-toolkit/Cargo.toml | 2 + crates/nvisy-toolkit/examples/pipeline.rs | 5 +- .../src/redaction/deanonymizer/mod.rs | 3 +- crates/nvisy-toolkit/src/redaction/mod.rs | 10 +- .../tests/fixtures/registries.rs | 11 +- .../tests/recognition_registry.rs | 7 +- 77 files changed, 1891 insertions(+), 1696 deletions(-) delete mode 100644 crates/nvisy-context/src/declaration.rs delete mode 100644 crates/nvisy-context/src/registry.rs create mode 100644 crates/nvisy-context/src/rule.rs create mode 100644 crates/nvisy-context/src/wrapper.rs 
create mode 100644 crates/nvisy-pattern/src/recognition/compiled.rs create mode 100644 crates/nvisy-pattern/src/recognition/regex.rs delete mode 100644 crates/nvisy-pattern/src/recognition/regex_rule.rs delete mode 100644 crates/nvisy-pattern/src/recognition/registry.rs diff --git a/Cargo.lock b/Cargo.lock index bfed20b0..5d31d114 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2894,12 +2894,10 @@ dependencies = [ name = "nvisy-context" version = "0.1.0" dependencies = [ - "derive_builder", + "async-trait", "hipstr", "nvisy-core", - "schemars", - "serde", - "thiserror", + "unicode-segmentation", ] [[package]] @@ -2938,7 +2936,6 @@ dependencies = [ "humantime-serde", "jiff", "nvisy-codec", - "nvisy-context", "nvisy-core", "nvisy-engine", "nvisy-llm", @@ -3007,7 +3004,6 @@ dependencies = [ "bentoml", "derive_builder", "lingua", - "nvisy-context", "nvisy-core", "serde", "tokio", @@ -3041,7 +3037,6 @@ dependencies = [ "nvisy-context", "nvisy-core", "regex", - "schemars", "serde", "tokio", "toml", @@ -3087,6 +3082,7 @@ dependencies = [ "async-trait", "base64", "nvisy-codec", + "nvisy-context", "nvisy-core", "nvisy-fake", "nvisy-llm", diff --git a/crates/nvisy-context/Cargo.toml b/crates/nvisy-context/Cargo.toml index 8c53f2d1..3f564243 100644 --- a/crates/nvisy-context/Cargo.toml +++ b/crates/nvisy-context/Cargo.toml @@ -26,16 +26,14 @@ rustdoc-args = ["--cfg", "docsrs"] # Internal crates nvisy-core = { workspace = true, features = [] } -# Serialization -serde = { workspace = true, features = [] } -schemars = { workspace = true, features = [] } - -# Derive macros and error handling -derive_builder = { workspace = true, features = [] } -thiserror = { workspace = true, features = [] } - # Primitive datatypes (cheap-clone surface form on `Token`) hipstr = { workspace = true, features = [] } +# Async runtime and parallelism +async-trait = { workspace = true, features = [] } + +# Text processing (word-window walk for the substring fallback) +unicode-segmentation = { workspace = 
true, features = [] } + [dev-dependencies] nvisy-core = { workspace = true, features = ["test-utils"] } diff --git a/crates/nvisy-context/src/declaration.rs b/crates/nvisy-context/src/declaration.rs deleted file mode 100644 index 5984183c..00000000 --- a/crates/nvisy-context/src/declaration.rs +++ /dev/null @@ -1,89 +0,0 @@ -//! [`Context`]: per-source keyword-boost declaration. -//! -//! Carried by anything that declares context — per-rule for -//! patterns (each `Regex`/`Dictionary` may declare one), -//! per-recognizer for NER (a single `default_context` on -//! `NerRecognizer`). The shape is identical regardless of who -//! registers it; the difference is only *what name* gets stored -//! against it in the [`ContextRegistry`]. -//! -//! [`ContextRegistry`]: super::ContextRegistry -//! -//! `window` and `boost` are `Option<_>` so the common case is "I -//! have keywords; use the enhancer's defaults." Override only when -//! the source needs a different policy than the enhancer's global -//! settings. - -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -/// Per-source context-boost declaration. -/// -/// Anything that wants to participate in post-recognition keyword -/// boosting registers one of these against its name in a -/// [`ContextRegistry`]. -/// -/// [`ContextRegistry`]: super::ContextRegistry -#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize, JsonSchema)] -#[serde(rename_all = "camelCase")] -pub struct Context { - /// Keywords whose presence near a match boosts the entity's - /// confidence. Empty list means "registered, but no boost - /// possible" — the enhancer skips this source. - pub keywords: Vec, - /// Override of the enhancer's default window (in bytes on each - /// side of the match). `None` defers to the enhancer's - /// configured default. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub window: Option, - /// Override of the enhancer's default additive boost. 
`None` - /// defers to the enhancer's configured default. - #[serde(default, skip_serializing_if = "Option::is_none")] - pub boost: Option, -} - -impl Context { - /// Construct with a keyword list. Window and boost default to - /// `None` (use the enhancer's defaults). - #[must_use] - pub fn new(keywords: impl IntoIterator>) -> Self { - Self { - keywords: keywords.into_iter().map(Into::into).collect(), - window: None, - boost: None, - } - } - - /// Override the enhancer's window setting for this source. - #[must_use] - pub fn with_window(mut self, window: usize) -> Self { - self.window = Some(window); - self - } - - /// Override the enhancer's boost setting for this source. - #[must_use] - pub fn with_boost(mut self, boost: f64) -> Self { - self.boost = Some(boost); - self - } - - /// Whether this context carries no boost-eligible keywords. - /// Empty contexts are skipped by the enhancer. - #[must_use] - pub fn is_empty(&self) -> bool { - self.keywords.is_empty() - } -} - -impl> From> for Context { - fn from(keywords: Vec) -> Self { - Self::new(keywords) - } -} - -impl From<[&str; N]> for Context { - fn from(keywords: [&str; N]) -> Self { - Self::new(keywords) - } -} diff --git a/crates/nvisy-context/src/enhancer.rs b/crates/nvisy-context/src/enhancer.rs index ab406d2d..f1eba2df 100644 --- a/crates/nvisy-context/src/enhancer.rs +++ b/crates/nvisy-context/src/enhancer.rs @@ -1,181 +1,191 @@ -//! [`ContextEnhancer`]: post-recognition keyword-boost pass for -//! any [`Entity`] regardless of which recognizer produced it. +//! [`Enhancer`]: post-recognition keyword-boost pass for any +//! [`Entity`] regardless of which recognizer produced it. 
-use derive_builder::{Builder, UninitializedFieldError}; -use nvisy_core::entity::{Entity, TrailStep}; -use nvisy_core::extraction::Artifacts; +use std::collections::HashMap; + +use nvisy_core::entity::{Entity, EntityLabelRef, TrailStep}; use nvisy_core::modality::Text; -use nvisy_core::primitive::Confidence; +use unicode_segmentation::UnicodeSegmentation; + +use super::matcher::KeywordMatcher; +use super::rule::BoostRule; +use super::tokens::Token; -use super::Tokens; -use super::matcher::{KeywordMatcher, SubstringMatcher}; -use super::registry::ContextRegistry; +/// Source name stamped onto every refinement [`TrailStep`] the +/// enhancer appends. +const TRAIL_SOURCE: &str = "context"; -/// Post-recognition enhancer that boosts entity confidence when -/// keywords declared by the source recognizer appear near the match. +/// Post-recognition enhancer. Holds a label-keyed [`BoostRule`] +/// map plus the keyword-matching strategy, and lifts the +/// confidence of each text entity whose label has a rule and +/// whose surrounding word window contains one of the rule's +/// keywords. /// -/// Construct via [`builder`]. The two required -/// settings are [`default_window`] (in source-text bytes on each -/// side of the match) and [`default_boost`] (the additive bump -/// applied when a keyword fires). Per-source overrides on -/// [`Context::window`] / [`Context::boost`] take precedence. +/// Construct via [`Enhancer::new`]. Rules are passed in by value; +/// duplicates for the same label are merged via +/// [`BoostRule::merge`] (union of keywords; window radii / `boost` +/// kept from the first-seen rule). /// -/// The matcher strategy defaults to [`SubstringMatcher`] when not -/// supplied. Wire [`LemmaMatcher`] instead when an upstream -/// `NlpEngine` populates [`Tokens`] in `RecognizerInput.artifacts` and you -/// want morphological-variant boosting. 
+/// The matcher defaults are picked by the engine that constructs +/// the enhancer: [`SubstringMatcher`] when no upstream NLP engine +/// produces tokens, [`LemmaMatcher`] when one does. /// -/// [`builder`]: Self::builder -/// [`Context::window`]: super::Context::window -/// [`Context::boost`]: super::Context::boost -/// [`default_window`]: ContextEnhancerBuilder::with_default_window -/// [`default_boost`]: ContextEnhancerBuilder::with_default_boost +/// [`SubstringMatcher`]: super::SubstringMatcher /// [`LemmaMatcher`]: super::LemmaMatcher -/// [`Tokens`]: super::Tokens -#[derive(Builder)] -#[builder( - name = "ContextEnhancerBuilder", - pattern = "owned", - setter(prefix = "with"), - build_fn(error = "ContextEnhancerBuilderError") -)] -pub struct ContextEnhancer { - /// Lookup table built at construction time. The enhancer reads - /// the source-recognizer / rule name off the entity's first - /// recognition step and looks it up here to find the declared - /// [`Context`]. - /// - /// [`Context`]: super::Context - #[builder(setter(custom))] - registry: ContextRegistry, - /// Keyword-matching strategy (substring, lemma, custom). - /// Defaults to [`SubstringMatcher`] when omitted. - #[builder( - setter(custom), - default = "Box::new(SubstringMatcher) as Box" - )] +pub struct Enhancer { + rules: HashMap, matcher: Box, - /// Default window radius (in source-text bytes on each side of - /// the match). Per-source [`Context::window`] overrides this. - /// - /// [`Context::window`]: super::Context::window - default_window: usize, - /// Default additive boost applied when a keyword fires. - /// Per-source [`Context::boost`] overrides this. - /// - /// [`Context::boost`]: super::Context::boost - default_boost: f64, } -impl ContextEnhancer { - /// Start building a `ContextEnhancer`. Required: - /// [`with_registry`], - /// [`with_default_window`], - /// [`with_default_boost`]. +impl Enhancer { + /// Construct from a rule iterator and matcher. 
Rules sharing + /// the same label are merged via [`BoostRule::merge`]. + pub fn new( + rules: impl IntoIterator, + matcher: Box, + ) -> Self { + let mut map: HashMap = HashMap::new(); + for rule in rules { + match map.get_mut(&rule.label) { + Some(existing) => existing.merge(rule), + None => { + map.insert(rule.label.clone(), rule); + } + } + } + Self { + rules: map, + matcher, + } + } + + /// `true` when no rules are registered. Engine code uses this + /// to short-circuit calls to [`enhance`] entirely. /// - /// [`with_registry`]: ContextEnhancerBuilder::with_registry - /// [`with_default_window`]: ContextEnhancerBuilder::with_default_window - /// [`with_default_boost`]: ContextEnhancerBuilder::with_default_boost + /// [`enhance`]: Self::enhance #[must_use] - pub fn builder() -> ContextEnhancerBuilder { - ContextEnhancerBuilder::default() + pub fn is_empty(&self) -> bool { + self.rules.is_empty() } - /// Borrow the underlying registry. Useful for diagnostics and - /// for engine code that wants to short-circuit when there are - /// no entries to boost against. + /// Number of distinct labels with rules. #[must_use] - pub fn registry(&self) -> &ContextRegistry { - &self.registry + pub fn len(&self) -> usize { + self.rules.len() } - /// Apply context-keyword boosting to `entities` in place. - /// - /// For each entity, looks at its first recognition step's - /// provenance to identify the source name, looks the name up - /// in the [`ContextRegistry`], walks the surrounding window - /// (token-based when [`Tokens`] are present in `artifacts` and - /// the matcher uses tokens, substring-based otherwise), and - /// bumps the confidence by the configured boost — capped at - /// `1.0`. A [`Refinement`] step is appended to the trail, and - /// the recognition step's `contextual` flag is set. + /// Apply boost rules to `entities` in place. 
For each entity: + /// look up the rule for its label, walk a window of + /// `prefix_words` words before and `suffix_words` words after + /// the entity's location, ask the matcher whether any keyword + /// fires, and on a hit lift confidence by the rule's `boost` + /// (saturating at the [`Confidence`] ceiling) plus append a + /// [`Refinement`] trail step. /// - /// Entities whose source isn't in the registry (or whose - /// declared context has an empty keyword list) pass through - /// unchanged. + /// `tokens` is the optional token artifact produced by an + /// upstream NLP engine. When present, words are counted + /// against the token stream; when absent, words are derived + /// from the source text via Unicode word segmentation. /// + /// [`Confidence`]: nvisy_core::primitive::Confidence /// [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement - pub fn enhance(&self, entities: &mut [Entity], text: &str, artifacts: &Artifacts) { - for entity in entities.iter_mut() { - self.enhance_one(entity, text, artifacts); + pub fn enhance(&self, entities: &mut [Entity], text: &str, tokens: Option<&[Token]>) { + if self.rules.is_empty() { + return; + } + for entity in entities { + self.enhance_one(entity, text, tokens); } } - fn enhance_one(&self, entity: &mut Entity, text: &str, artifacts: &Artifacts) { - let Some(name) = entity - .trail - .first() - .and_then(|s| s.provenance.name()) - .map(str::to_owned) - else { - return; - }; - let Some(ctx) = self.registry.get(&name) else { + fn enhance_one(&self, entity: &mut Entity, text: &str, tokens: Option<&[Token]>) { + let Some(rule) = self.rules.get(&entity.label) else { return; }; - if ctx.keywords.is_empty() { + if rule.keywords.is_empty() { return; } - let window = ctx.window.unwrap_or(self.default_window); - let boost = ctx.boost.unwrap_or(self.default_boost); let start = entity.location.start; let end = entity.location.end; - let snippet = window_around(text, start, end, window); - let tokens_in_window = 
artifacts - .get::() - .map(|t| t.around(start..end, window)); - // The matcher reads tokens by reference; wrap the in-window - // slice into a temporary owning `Tokens` only when one is - // present. - let owned_tokens; - let tokens_arg = match tokens_in_window { - Some(slice) if !slice.is_empty() => { - owned_tokens = Tokens::new(slice.to_vec()); - Some(&owned_tokens) - } - _ => None, + + // Prefer the token stream when the producer reached this + // entity. Fall back to the word-segmented substring window + // whenever the token slice would be empty — that covers + // `tokens: None`, `tokens: Some(&[])`, and the "tokens + // present but none overlap the entity" case (e.g. NLP + // engine only tokenized part of the document). + let token_slice = tokens + .map(|toks| slice_tokens_around(toks, start, end, rule.prefix_words, rule.suffix_words)) + .unwrap_or(&[]); + let (snippet, tokens_in_window): (&str, &[Token]) = if token_slice.is_empty() { + let snippet = word_window(text, start, end, rule.prefix_words, rule.suffix_words); + (snippet, &[]) + } else { + let snippet = token_span(text, token_slice, start, end); + (snippet, token_slice) }; - if !self.matcher.any_match(snippet, tokens_arg, &ctx.keywords) { + + if !self + .matcher + .any_match(snippet, tokens_in_window, &rule.keywords) + { return; } let original = entity.confidence; - let adjusted_raw = (original.get() + boost).clamp(0.0, 1.0); - let Some(adjusted) = Confidence::new(adjusted_raw) else { + let adjusted = original.saturating_add(rule.boost.get()); + if adjusted == original { return; - }; - entity.confidence = adjusted; - - if let Some(step) = entity.trail.first_mut() { - step.provenance.mark_contextual(); } + entity.confidence = adjusted; entity.trail.push(TrailStep::refinement( - "context-enhancer", + TRAIL_SOURCE, original, adjusted, - format!("context keyword near `{name}` (+{boost})"), + format!( + "context keyword near `{}` (+{:.3})", + entity.label.as_str(), + rule.boost.get(), + ), )); } } -/// 
Borrow a `window`-radius slice of `text` centered on the entity -/// location, clamped to the string bounds and snapped to UTF-8 -/// character boundaries. -fn window_around(text: &str, start: usize, end: usize, window: usize) -> &str { - let lo = floor_char_boundary(text, start.saturating_sub(window)); - let hi = ceil_char_boundary(text, end.saturating_add(window).min(text.len())); +/// Walk `prefix` words before `[start, end)` and `suffix` words +/// after, via Unicode word segmentation, and return the spanning +/// substring (including any non-word whitespace and punctuation +/// between words). The returned slice covers `[start, end)` itself +/// plus the prefix / suffix words; the entity's own bytes are +/// always inside. +fn word_window(text: &str, start: usize, end: usize, prefix: usize, suffix: usize) -> &str { + let prefix_text = &text[..start.min(text.len())]; + let suffix_text = &text[end.min(text.len())..]; + + // `unicode_word_indices` yields `(byte_offset, word_str)` for + // every "word" (alphanumeric run) in source order. Take the + // last `prefix` on the prefix side, the first `suffix` on the + // suffix side, and compute the spanning byte range. + let prefix_words: Vec<(usize, &str)> = prefix_text.unicode_word_indices().collect(); + let prefix_take = prefix_words.len().saturating_sub(prefix); + let prefix_byte = prefix_words + .get(prefix_take) + .map(|(idx, _)| *idx) + .unwrap_or(start.min(text.len())); + + let suffix_byte = if suffix == 0 { + end.min(text.len()) + } else { + suffix_text + .unicode_word_indices() + .nth(suffix - 1) + .map(|(idx, word)| end + idx + word.len()) + .unwrap_or(text.len()) + }; + + let lo = floor_char_boundary(text, prefix_byte); + let hi = ceil_char_boundary(text, suffix_byte.min(text.len())); &text[lo..hi] } @@ -193,171 +203,389 @@ fn ceil_char_boundary(s: &str, mut pos: usize) -> usize { pos } -impl ContextEnhancerBuilder { - /// Attach the [`ContextRegistry`] the enhancer reads at boost - /// time. Required. 
- #[must_use] - pub fn with_registry(mut self, registry: ContextRegistry) -> Self { - self.registry = Some(registry); - self +/// Slice tokens by *count*: take `prefix` tokens before the first +/// token overlapping `[start, end)` and `suffix` tokens after the +/// last. The returned slice is contiguous. +fn slice_tokens_around( + tokens: &[Token], + start: usize, + end: usize, + prefix: usize, + suffix: usize, +) -> &[Token] { + if tokens.is_empty() { + return &[]; } - - /// Override the keyword-matching strategy. Defaults to - /// [`SubstringMatcher`]. - #[must_use] - pub fn with_matcher(mut self, matcher: M) -> Self { - self.matcher = Some(Box::new(matcher)); - self + // First token whose `offset.end > start` overlaps or follows the entity. + let first_overlap = tokens.partition_point(|t| t.offset.end <= start); + // One past the last token whose `offset.start < end` overlaps the entity. + let last_overlap = tokens.partition_point(|t| t.offset.start < end); + let lo = first_overlap.saturating_sub(prefix); + let hi = (last_overlap + suffix).min(tokens.len()); + if lo >= hi { + return &[]; } + &tokens[lo..hi] } -/// Error returned by [`ContextEnhancerBuilder::build`]. -#[derive(Debug, thiserror::Error)] -#[error("context enhancer build failed: {0}")] -pub struct ContextEnhancerBuilderError(String); - -impl From for ContextEnhancerBuilderError { - fn from(err: UninitializedFieldError) -> Self { - Self(format!("missing field `{}`", err.field_name())) - } +/// Spanning substring covering `tokens` plus the entity itself. +/// Used to give the matcher a contiguous text window when slicing +/// against the token stream. +/// +/// Precondition: `tokens` is non-empty. Callers must take the +/// `word_window` fallback path when their token slice is empty — +/// see `Enhancer::enhance_one`. 
+fn token_span<'a>(text: &'a str, tokens: &[Token], start: usize, end: usize) -> &'a str { + debug_assert!(!tokens.is_empty(), "token_span requires non-empty slice"); + let lo = tokens[0].offset.start.min(start); + let hi = tokens[tokens.len() - 1].offset.end.max(end); + let lo = floor_char_boundary(text, lo.min(text.len())); + let hi = ceil_char_boundary(text, hi.min(text.len())); + &text[lo..hi] } #[cfg(test)] mod tests { use nvisy_core::entity::{ - EntityLabelRef, ModelProvenance, PatternProvenance, TrailProvenance, TrailStepKind, - builtins, + EntityLabelRef, PatternProvenance, TrailProvenance, TrailStepKind, builtins, }; - use nvisy_core::extraction::Artifacts; use nvisy_core::modality::{Text, TextLocation}; + use nvisy_core::primitive::Confidence; use super::*; - use crate::Context; - - fn pattern_entity(name: &str, span: std::ops::Range) -> Entity { - let confidence = Confidence::new(0.6).unwrap(); - let provenance = TrailProvenance::Pattern(PatternProvenance::Regex { - name: name.to_owned(), - regex: None, - validator: None, - contextual: false, - }); - let step = TrailStep::recognition( - "pattern", - confidence, - provenance, - format!("pattern `{name}` matched"), - ); - Entity::builder() - .with_label(EntityLabelRef::from(builtins::GOVERNMENT_ID.name.clone())) - .with_trail(vec![step]) - .with_confidence(confidence) - .with_location(TextLocation::new(span.start, span.end)) - .build() - .expect("entity builds") + use crate::SubstringMatcher; + + fn govid_label() -> EntityLabelRef { + builtins::GOVERNMENT_ID.label_ref() } - fn model_entity(name: &str, span: std::ops::Range) -> Entity { - let confidence = Confidence::new(0.5).unwrap(); - let provenance = TrailProvenance::Model(ModelProvenance::new(name)); + fn person_label() -> EntityLabelRef { + builtins::PERSON_NAME.label_ref() + } + + fn entity(label: EntityLabelRef, start: usize, end: usize, score: f64) -> Entity { + let confidence = Confidence::new(score).unwrap(); let step = TrailStep::recognition( - 
"ner", + "test", confidence, - provenance, - format!("model `{name}` matched"), + TrailProvenance::Pattern(PatternProvenance::DenyList), + "test fixture", ); Entity::builder() - .with_label(EntityLabelRef::from(builtins::PERSON_NAME.name.clone())) + .with_label(label) .with_trail(vec![step]) .with_confidence(confidence) - .with_location(TextLocation::new(span.start, span.end)) + .with_location(TextLocation::new(start, end)) .build() .expect("entity builds") } - fn enhancer(registry: ContextRegistry) -> ContextEnhancer { - ContextEnhancer::builder() - .with_registry(registry) - .with_default_window(80) - .with_default_boost(0.2) - .build() - .expect("enhancer builds") + fn enhancer(rules: Vec) -> Enhancer { + Enhancer::new(rules, Box::new(SubstringMatcher)) + } + + fn rule( + label: EntityLabelRef, + keywords: &[&'static str], + prefix: usize, + suffix: usize, + boost: f64, + ) -> BoostRule { + BoostRule::new( + label, + keywords.iter().copied(), + prefix, + suffix, + Confidence::clamped(boost), + ) } #[test] - fn boosts_pattern_entity_when_keyword_near() { - let registry = - ContextRegistry::new().with_entry("ssn", Context::new(["ssn", "social security"])); - let enhancer = enhancer(registry); + fn boosts_entity_when_keyword_in_word_window() { + let enhancer = enhancer(vec![rule( + govid_label(), + &["ssn", "social security"], + 5, + 5, + 0.2, + )]); let text = "Your SSN: 123-45-6789"; - let mut entities = vec![pattern_entity("ssn", 10..21)]; - let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &Artifacts::new()); - assert!(entities[0].confidence.get() > before); + let mut entities = vec![entity(govid_label(), 10, 21, 0.6)]; + enhancer.enhance(&mut entities, text, None); + assert!(entities[0].confidence.get() > 0.6); assert!( entities[0] .trail .iter() - .any(|s| matches!(s.kind, TrailStepKind::Refinement)) + .any(|s| matches!(s.kind, TrailStepKind::Refinement)), ); - let TrailProvenance::Pattern(PatternProvenance::Regex { 
contextual, .. }) = - &entities[0].trail[0].provenance - else { - panic!("expected regex provenance"); - }; - assert!(contextual); } #[test] - fn boosts_model_entity_when_keyword_near() { - let registry = - ContextRegistry::new().with_entry("gliner", Context::new(["named", "called", "mr"])); - let enhancer = enhancer(registry); - let text = "Mr. Smith is named in the report."; - let mut entities = vec![model_entity("gliner", 4..9)]; + fn boosts_entity_when_keyword_in_suffix() { + let enhancer = enhancer(vec![rule(govid_label(), &["social"], 0, 5, 0.2)]); + let text = "123-45-6789 (social security number)"; + let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; + enhancer.enhance(&mut entities, text, None); + assert!( + entities[0].confidence.get() > 0.6, + "trailing keyword within suffix window should boost", + ); + } + + #[test] + fn suffix_zero_ignores_trailing_keyword() { + // Prefix-only: trailing keyword must not boost. + let enhancer = enhancer(vec![rule(govid_label(), &["social"], 5, 0, 0.2)]); + let text = "123-45-6789 (social security number)"; + let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &Artifacts::new()); - assert!(entities[0].confidence.get() > before); - let TrailProvenance::Model(prov) = &entities[0].trail[0].provenance else { - panic!("expected model provenance"); - }; - assert!(prov.contextual); + enhancer.enhance(&mut entities, text, None); + assert_eq!(entities[0].confidence.get(), before); } #[test] - fn skips_entity_with_no_registered_source() { - let registry = ContextRegistry::new(); - let enhancer = enhancer(registry); - let text = "Your SSN: 123-45-6789"; - let mut entities = vec![pattern_entity("ssn", 10..21)]; + fn skips_entity_with_no_rule_for_label() { + let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); + let text = "Mr. 
Smith is named in the report."; + let mut entities = vec![entity(person_label(), 4, 9, 0.5)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &Artifacts::new()); + enhancer.enhance(&mut entities, text, None); assert_eq!(entities[0].confidence.get(), before); } #[test] - fn per_source_window_overrides_default() { - let registry = - ContextRegistry::new().with_entry("far", Context::new(["far_keyword"]).with_window(5)); - let enhancer = enhancer(registry); - let text = "far_keyword XYZ here"; - let mut entities = vec![pattern_entity("far", 39..42)]; + fn window_bounds_the_search() { + // 2-word prefix / 2-word suffix: "far_keyword" is at the + // start; the entity is after many filler words. + let enhancer = enhancer(vec![rule(govid_label(), &["far_keyword"], 2, 2, 0.2)]); + let text = "far_keyword here is some filler between the keyword and XYZ here"; + let xyz_start = text.find("XYZ").unwrap(); + let xyz_end = xyz_start + "XYZ".len(); + let mut entities = vec![entity(govid_label(), xyz_start, xyz_end, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, &Artifacts::new()); + enhancer.enhance(&mut entities, text, None); assert_eq!(entities[0].confidence.get(), before); } #[test] - fn boost_caps_at_one() { - let registry = - ContextRegistry::new().with_entry("high", Context::new(["here"]).with_boost(0.9)); - let enhancer = enhancer(registry); + fn boost_saturates_at_one() { + let enhancer = enhancer(vec![rule(govid_label(), &["here"], 5, 5, 0.9)]); let text = "the value is right here in plain sight"; - let mut entity = pattern_entity("high", 16..21); - // Push base confidence to 0.95 - entity.confidence = Confidence::new(0.95).unwrap(); - let mut entities = vec![entity]; - enhancer.enhance(&mut entities, text, &Artifacts::new()); + let mut entities = vec![entity(govid_label(), 16, 21, 0.95)]; + enhancer.enhance(&mut entities, text, None); assert!((entities[0].confidence.get() - 1.0).abs() < 
f64::EPSILON); } + + #[test] + fn duplicate_label_rules_merge_keywords() { + // Two rules for the same label, each contributing a + // distinct keyword. The merged rule must trigger boosts + // for matches near keywords from *either* original source, + // proving the keyword union survived the merge (not just + // last-write-wins). + let make_enhancer = || { + enhancer(vec![ + rule(govid_label(), &["ssn"], 5, 5, 0.2), + rule(govid_label(), &["tax id"], 5, 5, 0.2), + ]) + }; + assert_eq!(make_enhancer().len(), 1); + + // Keyword only from the first rule. + let ssn_only = "ssn: 123-45-6789"; + let ssn_entity_start = ssn_only.find("123").unwrap(); + let ssn_entity_end = ssn_entity_start + "123-45-6789".len(); + let mut from_first = vec![entity(govid_label(), ssn_entity_start, ssn_entity_end, 0.6)]; + make_enhancer().enhance(&mut from_first, ssn_only, None); + assert!( + from_first[0].confidence.get() > 0.6, + "keyword `ssn` from the first rule must still boost after merge", + ); + + // Keyword only from the second rule. + let taxid_only = "tax id: 987-65-4329"; + let tax_entity_start = taxid_only.find("987").unwrap(); + let tax_entity_end = tax_entity_start + "987-65-4329".len(); + let mut from_second = vec![entity(govid_label(), tax_entity_start, tax_entity_end, 0.6)]; + make_enhancer().enhance(&mut from_second, taxid_only, None); + assert!( + from_second[0].confidence.get() > 0.6, + "keyword `tax id` from the second rule must still boost after merge", + ); + } + + #[test] + fn word_window_handles_unicode() { + // 3-word prefix reaches "café" past "naïve" and "resume". 
+ let enhancer = enhancer(vec![rule(govid_label(), &["café"], 3, 0, 0.2)]); + let text = "café naïve resume — 123-45-6789"; + let entity_start = text.find("123").unwrap(); + let entity_end = entity_start + "123-45-6789".len(); + let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; + enhancer.enhance(&mut entities, text, None); + assert!( + entities[0].confidence.get() > 0.6, + "unicode word should be reachable within 3-word prefix", + ); + } + + #[test] + fn word_window_excludes_too_distant_unicode() { + // 2-word prefix: "café" is the 3rd word before the entity. + let enhancer = enhancer(vec![rule(govid_label(), &["café"], 2, 0, 0.2)]); + let text = "café naïve resume — 123-45-6789"; + let entity_start = text.find("123").unwrap(); + let entity_end = entity_start + "123-45-6789".len(); + let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; + let before = entities[0].confidence.get(); + enhancer.enhance(&mut entities, text, None); + assert_eq!(entities[0].confidence.get(), before); + } + + #[test] + fn empty_tokens_slice_matches_none_behaviour() { + // Keyword sits in the prefix word-window but outside the + // entity bytes. With the empty-slice fix, `Some(&[])` must + // not collapse the snippet to the entity bytes — it should + // fall back to the word-window path just like `None`. 
+ let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); + let text = "Your SSN: 123-45-6789"; + let mut from_none = vec![entity(govid_label(), 10, 21, 0.6)]; + let mut from_empty = vec![entity(govid_label(), 10, 21, 0.6)]; + enhancer.enhance(&mut from_none, text, None); + enhancer.enhance(&mut from_empty, text, Some(&[])); + assert_eq!( + from_none[0].confidence.get(), + from_empty[0].confidence.get(), + "Some(&[]) must behave identically to None", + ); + assert!( + from_empty[0].confidence.get() > 0.6, + "empty tokens slice must still allow the word-window fallback to boost", + ); + } + + #[test] + fn token_path_counts_words_against_token_stream() { + // 1-word prefix, 0-word suffix: the only word the + // prefix reaches is the immediate predecessor token + // "Your". The tokenizer here treats "social security" + // as a single compound token outside the window, so the + // keyword "social security" must NOT fire — unlike a + // hypothetical caller that gave it the word-window path, + // which would split on whitespace. + let enhancer = enhancer(vec![rule( + govid_label(), + &["social security"], + 1, + 0, + 0.2, + )]); + let text = "social security: Your 123-45-6789"; + let entity_start = text.find("123").unwrap(); + let entity_end = entity_start + "123-45-6789".len(); + let tokens: Vec = vec![ + Token::from_text("social security", 0..15), + Token::from_text("Your", 17..21), + Token::from_text("123-45-6789", 22..33), + ]; + let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; + let before = entities[0].confidence.get(); + enhancer.enhance(&mut entities, text, Some(&tokens)); + assert_eq!( + entities[0].confidence.get(), + before, + "1-word prefix should not reach the `social security` token two positions back", + ); + } + + #[test] + fn token_path_boosts_when_keyword_within_token_window() { + // Same tokens, 2-word prefix: now the `social security` + // token is reachable and the boost fires. 
+ let enhancer = enhancer(vec![rule( + govid_label(), + &["social security"], + 2, + 0, + 0.2, + )]); + let text = "social security: Your 123-45-6789"; + let entity_start = text.find("123").unwrap(); + let entity_end = entity_start + "123-45-6789".len(); + let tokens: Vec = vec![ + Token::from_text("social security", 0..15), + Token::from_text("Your", 17..21), + Token::from_text("123-45-6789", 22..33), + ]; + let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; + enhancer.enhance(&mut entities, text, Some(&tokens)); + assert!( + entities[0].confidence.get() > 0.6, + "2-word prefix should reach the `social security` token", + ); + } + + #[test] + fn lemma_matcher_boosts_on_morphological_variant() { + // Substring matcher would miss `running` for keyword + // `run`. Lemma matcher reads the lemma directly off the + // token and boosts. + let enhancer = Enhancer::new( + vec![rule(govid_label(), &["run"], 5, 5, 0.2)], + Box::new(crate::LemmaMatcher), + ); + let text = "They were running 123-45-6789 across the system"; + let entity_start = text.find("123").unwrap(); + let entity_end = entity_start + "123-45-6789".len(); + let tokens: Vec = vec![ + Token::from_text("They", 0..4), + Token::from_text("were", 5..9), + Token::from_text("running", 10..17).with_lemma("run"), + Token::from_text("123-45-6789", 18..29), + Token::from_text("across", 30..36), + Token::from_text("the", 37..40), + Token::from_text("system", 41..47), + ]; + let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; + enhancer.enhance(&mut entities, text, Some(&tokens)); + assert!( + entities[0].confidence.get() > 0.6, + "lemma matcher should match `run` against the `running` token's lemma", + ); + assert!( + entities[0] + .trail + .iter() + .any(|s| matches!(s.kind, TrailStepKind::Refinement)), + ); + } + + #[test] + fn tokens_with_no_overlap_fall_back_to_word_window() { + // Tokens cover the first half of the document; the entity + // is in the second 
half, outside any token's range. + // Without the fallback the token slice would be empty and + // the snippet would collapse to entity bytes. With the + // fallback, the word-window path reaches the keyword. + let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); + let text = "First half of the document. Your SSN: 123-45-6789"; + let entity_start = text.find("123").unwrap(); + let entity_end = entity_start + "123-45-6789".len(); + // Tokens that cover only the first sentence. + let tokens: Vec = vec![ + Token::from_text("First", 0..5), + Token::from_text("half", 6..10), + Token::from_text("of", 11..13), + Token::from_text("the", 14..17), + Token::from_text("document", 18..26), + ]; + let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; + enhancer.enhance(&mut entities, text, Some(&tokens)); + assert!( + entities[0].confidence.get() > 0.6, + "tokens that don't overlap the entity must fall back to the word window", + ); + } } diff --git a/crates/nvisy-context/src/lib.rs b/crates/nvisy-context/src/lib.rs index 2004d7c6..192796f2 100644 --- a/crates/nvisy-context/src/lib.rs +++ b/crates/nvisy-context/src/lib.rs @@ -2,14 +2,14 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![doc = include_str!("../README.md")] -mod declaration; mod enhancer; mod matcher; -mod registry; +mod rule; mod tokens; +mod wrapper; -pub use self::declaration::Context; -pub use self::enhancer::{ContextEnhancer, ContextEnhancerBuilder, ContextEnhancerBuilderError}; +pub use self::enhancer::Enhancer; pub use self::matcher::{KeywordMatcher, LemmaMatcher, SubstringMatcher}; -pub use self::registry::ContextRegistry; +pub use self::rule::{BoostRule, DEFAULT_BOOST, DEFAULT_PREFIX_WORDS, DEFAULT_SUFFIX_WORDS}; pub use self::tokens::{Token, Tokens}; +pub use self::wrapper::Boosting; diff --git a/crates/nvisy-context/src/matcher.rs b/crates/nvisy-context/src/matcher.rs index 4cc939ac..a2cdb3c3 100644 --- a/crates/nvisy-context/src/matcher.rs +++ 
b/crates/nvisy-context/src/matcher.rs @@ -1,72 +1,68 @@ //! [`KeywordMatcher`] strategy + the two shipped implementations. //! //! - [`SubstringMatcher`] — ASCII case-insensitive substring search -//! over the raw text window. The fallback when no [`Tokens`] are -//! present in `RecognizerInput.artifacts`. +//! over the raw text window. The fallback when no token artifact +//! is present on `RecognizerInput.artifacts`. //! - [`LemmaMatcher`] — matches keywords against lemmatized tokens -//! stamped on `RecognizerInput.artifacts` as a [`Tokens`] entry by an -//! upstream NLP engine. Recognizes morphological variants -//! ("running" → "run", "SSNs" → "ssn") that substring matching -//! misses, at the cost of needing a producer engine with -//! lemmatization. +//! the upstream NLP engine stamped on `RecognizerInput.artifacts` +//! as a [`Tokens`] entry. Recognizes morphological variants +//! ("running" → "run", "SSNs" → "ssn") substring matching misses. //! -//! Both implementations are stateless; the -//! [`ContextEnhancer`] owns one as a -//! configured strategy. +//! Both implementations are stateless; the [`Enhancer`] owns one +//! as a configured strategy. //! //! [`Tokens`]: super::Tokens -//! [`ContextEnhancer`]: super::ContextEnhancer +//! [`Enhancer`]: super::Enhancer -use super::Tokens; +use hipstr::HipStr; -/// Decide whether any keyword from `keywords` fires within `window`. +use super::Token; + +/// Decide whether any keyword from `keywords` fires within the +/// candidate region around an entity match. /// -/// The trait is the strategy slot that lets the enhancer swap raw -/// substring matching for lemma-aware matching (or a third-party +/// The strategy slot that lets the enhancer swap raw substring +/// matching for lemma-aware matching (or a third-party /// fuzzy/word-boundary implementation) without changing its core /// pipeline. 
/// /// Implementations receive both a raw `window` slice of the source -/// text (for substring strategies) and an optional `tokens` view -/// (for token/lemma strategies). Either or both may be ignored. +/// text (for substring strategies) and the `tokens` covering that +/// same range (for token/lemma strategies). Either or both may be +/// ignored; `tokens` is empty when no NLP engine produced a token +/// artifact. pub trait KeywordMatcher: Send + Sync { /// `true` if at least one keyword from `keywords` appears in - /// the input. `window` is the raw text slice surrounding the - /// entity match; `tokens` is the subset of [`Tokens`] covering - /// that same range when an upstream NLP engine produced one, - /// `None` otherwise. - /// - /// [`Tokens`]: super::Tokens - fn any_match(&self, window: &str, tokens: Option<&Tokens>, keywords: &[String]) -> bool; + /// the input. + fn any_match(&self, window: &str, tokens: &[Token], keywords: &[HipStr<'static>]) -> bool; } -/// ASCII case-insensitive substring matcher. The default — used -/// whenever no [`Tokens`] were stamped on `RecognizerInput.artifacts`, or -/// whenever the caller explicitly picks raw matching. +/// ASCII case-insensitive substring matcher. The default — +/// runs whenever no token artifact was stamped on +/// `RecognizerInput.artifacts`, or whenever the caller explicitly +/// picks raw matching. /// /// Fast, allocation-light, permissive: the keyword `"email"` fires /// inside `"MyEmailAddress"`. Ignores the `tokens` argument. 
-/// -/// [`Tokens`]: super::Tokens #[derive(Debug, Clone, Copy, Default)] pub struct SubstringMatcher; impl KeywordMatcher for SubstringMatcher { - fn any_match(&self, window: &str, _tokens: Option<&Tokens>, keywords: &[String]) -> bool { + fn any_match(&self, window: &str, _tokens: &[Token], keywords: &[HipStr<'static>]) -> bool { let lowered = window.to_ascii_lowercase(); keywords .iter() - .any(|kw| lowered.contains(&kw.to_ascii_lowercase())) + .any(|kw| lowered.contains(kw.as_str().to_ascii_lowercase().as_str())) } } -/// Lemma-aware matcher. Compares each lemma in `tokens` against the -/// keyword list with ASCII case-insensitive equality. +/// Lemma-aware matcher. Compares each lemma in `tokens` against +/// the keyword list with ASCII case-insensitive equality. /// /// Falls back to [`SubstringMatcher`] semantics when `tokens` is -/// `None` (no shared NLP artifact was produced) so the enhancer -/// can be wired uniformly regardless of whether a given scan had -/// artifacts. +/// empty (no shared NLP artifact was produced) so the enhancer +/// runs uniformly regardless of whether the upstream pass emitted +/// tokens. /// /// Recognizes morphological variants the substring matcher cannot: /// `"running" → "run"`, `"dogs" → "dog"`, `"SSNs" → "ssn"`. 
Cost @@ -76,60 +72,59 @@ impl KeywordMatcher for SubstringMatcher { pub struct LemmaMatcher; impl KeywordMatcher for LemmaMatcher { - fn any_match(&self, window: &str, tokens: Option<&Tokens>, keywords: &[String]) -> bool { - let Some(tokens) = tokens else { - return SubstringMatcher.any_match(window, None, keywords); - }; - let lowered_keywords: Vec = - keywords.iter().map(|k| k.to_ascii_lowercase()).collect(); + fn any_match(&self, window: &str, tokens: &[Token], keywords: &[HipStr<'static>]) -> bool { + if tokens.is_empty() { + return SubstringMatcher.any_match(window, tokens, keywords); + } + let lowered_keywords: Vec = keywords + .iter() + .map(|k| k.as_str().to_ascii_lowercase()) + .collect(); tokens.iter().any(|tok| { - let lemma = tok.lemma.to_ascii_lowercase(); - lowered_keywords.iter().any(|kw| kw == &lemma) + let lemma = tok.lemma.as_str().to_ascii_lowercase(); + lowered_keywords.contains(&lemma) }) } } #[cfg(test)] mod tests { - use super::super::Token; use super::*; + fn kws(items: &[&'static str]) -> Vec> { + items.iter().copied().map(HipStr::from).collect() + } + #[test] fn substring_matches_case_insensitively() { let m = SubstringMatcher; - assert!(m.any_match("Your SSN: 123", None, &["ssn".into()])); - assert!(m.any_match( - "the SOCIAL SECURITY number", - None, - &["social security".into()] - )); - assert!(!m.any_match("nothing here", None, &["ssn".into()])); + assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); + assert!(m.any_match("the SOCIAL SECURITY number", &[], &kws(&["social security"]))); + assert!(!m.any_match("nothing here", &[], &kws(&["ssn"]))); } #[test] fn substring_is_permissive() { let m = SubstringMatcher; - assert!(m.any_match("MyEmailAddress", None, &["email".into()])); + assert!(m.any_match("MyEmailAddress", &[], &kws(&["email"]))); } #[test] fn lemma_matches_morph_variants() { - // tokens with lemmatization: "running" → "run", "dogs" → "dog" - let tokens = Tokens::new(vec![ + let tokens = vec![ Token::from_text("the", 
0..3), Token::from_text("running", 4..11).with_lemma("run"), Token::from_text("dogs", 12..16).with_lemma("dog"), - ]); + ]; let m = LemmaMatcher; - assert!(m.any_match("", Some(&tokens), &["run".into()])); - assert!(m.any_match("", Some(&tokens), &["dog".into()])); - assert!(!m.any_match("", Some(&tokens), &["cat".into()])); + assert!(m.any_match("", &tokens, &kws(&["run"]))); + assert!(m.any_match("", &tokens, &kws(&["dog"]))); + assert!(!m.any_match("", &tokens, &kws(&["cat"]))); } #[test] fn lemma_falls_back_to_substring_without_tokens() { let m = LemmaMatcher; - // No artifacts → fall back to substring matching. - assert!(m.any_match("Your SSN: 123", None, &["ssn".into()])); + assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); } } diff --git a/crates/nvisy-context/src/registry.rs b/crates/nvisy-context/src/registry.rs deleted file mode 100644 index d0043bc9..00000000 --- a/crates/nvisy-context/src/registry.rs +++ /dev/null @@ -1,122 +0,0 @@ -//! [`ContextRegistry`]: the `name → Context` lookup the enhancer -//! reads at boost time. -//! -//! The recognizer side of the pipeline (`PatternRecognizer`, -//! `NerRecognizer`, …) registers one entry per -//! source name — for patterns that's one entry per -//! `Regex`/`Dictionary` rule; for NER it's typically one entry per -//! recognizer keyed on the recognizer's name. The enhancer reads -//! the entity's first-step provenance, pulls the name, and looks -//! up the [`Context`] here. -//! -//! Last-write-wins on duplicate names: callers are responsible for -//! choosing distinct keys when mixing per-rule and per-recognizer -//! registrations. - -use std::collections::HashMap; - -use super::Context; - -/// Lookup table the [`ContextEnhancer`] -/// reads at boost time. -/// -/// Construct with [`new`], populate with -/// [`with_entry`] / -/// [`with_entries`], then hand to a -/// [`ContextEnhancerBuilder`]. 
-/// -/// [`ContextEnhancer`]: super::ContextEnhancer -/// [`new`]: Self::new -/// [`with_entry`]: Self::with_entry -/// [`with_entries`]: Self::with_entries -/// [`ContextEnhancerBuilder`]: super::ContextEnhancerBuilder -#[derive(Debug, Clone, Default)] -pub struct ContextRegistry { - entries: HashMap, -} - -impl ContextRegistry { - /// Empty registry. - #[must_use] - pub fn new() -> Self { - Self::default() - } - - /// Register one entry. Last write wins on duplicate names. - #[must_use] - pub fn with_entry(mut self, name: impl Into, context: Context) -> Self { - let context_name = name.into(); - if !context.is_empty() { - self.entries.insert(context_name, context); - } - self - } - - /// Register many entries. - #[must_use] - pub fn with_entries(mut self, entries: I) -> Self - where - I: IntoIterator, - S: Into, - { - for (name, context) in entries { - let context_name = name.into(); - if !context.is_empty() { - self.entries.insert(context_name, context); - } - } - self - } - - /// Merge another registry into this one. Last-write-wins on - /// duplicate names. Used to combine per-source registries (e.g. - /// pattern registry + NER registry) into one enhancer input. - #[must_use] - pub fn merge(mut self, other: ContextRegistry) -> Self { - for (name, context) in other.entries { - self.entries.insert(name, context); - } - self - } - - /// Look up the [`Context`] for `name`. Returns `None` when the - /// name was never registered or when the registered context - /// had an empty keyword list (which is treated as "not - /// registered" — see [`with_entry`]). - /// - /// [`with_entry`]: Self::with_entry - #[must_use] - pub fn get(&self, name: &str) -> Option<&Context> { - self.entries.get(name) - } - - /// Number of registered names with non-empty contexts. - #[must_use] - pub fn len(&self) -> usize { - self.entries.len() - } - - /// Whether the registry has no entries. 
- #[must_use] - pub fn is_empty(&self) -> bool { - self.entries.is_empty() - } -} - -impl Extend<(String, Context)> for ContextRegistry { - fn extend>(&mut self, iter: I) { - for (name, context) in iter { - if !context.is_empty() { - self.entries.insert(name, context); - } - } - } -} - -impl FromIterator<(String, Context)> for ContextRegistry { - fn from_iter>(iter: I) -> Self { - let mut registry = Self::new(); - registry.extend(iter); - registry - } -} diff --git a/crates/nvisy-context/src/rule.rs b/crates/nvisy-context/src/rule.rs new file mode 100644 index 00000000..7f88cf78 --- /dev/null +++ b/crates/nvisy-context/src/rule.rs @@ -0,0 +1,140 @@ +//! [`BoostRule`]: per-label keyword-boost rule. +//! +//! One rule per [`EntityLabelRef`] declares the keyword set that +//! lifts confidence when one of those keywords appears within +//! `prefix_words` words before or `suffix_words` words after an +//! entity carrying that label. The window radii and the additive +//! `boost` are resolved at rule construction time — there are no +//! per-source overrides at apply time. +//! +//! Producers (the pattern crate today, future NER/LLM/custom +//! recognizer authors) hand the engine a `Vec` keyed by +//! label. When several rules contribute to the same label (e.g. +//! two different SSN detectors both contributing to +//! `GOVERNMENT_ID`), the engine merges them by union of keywords — +//! see [`BoostRule::merge`]. +//! +//! [`EntityLabelRef`]: nvisy_core::entity::EntityLabelRef + +use std::collections::HashSet; + +use hipstr::HipStr; +use nvisy_core::entity::EntityLabelRef; +use nvisy_core::primitive::Confidence; + +/// Default window radius in words *before* an entity match. +/// Mirrors Presidio's `context_prefix_count = 5`. +pub const DEFAULT_PREFIX_WORDS: usize = 5; + +/// Default window radius in words *after* an entity match. 
Set +/// equal to [`DEFAULT_PREFIX_WORDS`] so trailing context like +/// "123-45-6789 (social security)" boosts the same as leading +/// context. Presidio defaults `context_suffix_count` to `0`; we +/// pick symmetric defaults because operators rarely realize the +/// asymmetry exists, and one-sided windows surprise people. +pub const DEFAULT_SUFFIX_WORDS: usize = 5; + +/// Default additive boost applied when a keyword fires. Matches +/// Presidio's `context_similarity_factor = 0.35`. +pub const DEFAULT_BOOST: f64 = 0.35; + +/// Per-label boost rule the [`Enhancer`] applies at runtime. +/// +/// [`Enhancer`]: super::Enhancer +#[derive(Debug, Clone, PartialEq)] +pub struct BoostRule { + /// Entity label this rule applies to. Each emitted + /// `Entity` whose [`label`] matches is checked against + /// this rule's keywords. + /// + /// [`label`]: nvisy_core::entity::Entity::label + pub label: EntityLabelRef, + /// Keywords whose presence near a match lifts the entity's + /// confidence. Stored as [`HipStr`] for cheap clones across + /// per-pass rule sets. + pub keywords: Vec>, + /// Window radius in words *before* the entity's match. + /// Counted against the token artifact on + /// `RecognizerInput.artifacts` when present, or via Unicode + /// word segmentation of the source text otherwise. + pub prefix_words: usize, + /// Window radius in words *after* the entity's match. Same + /// source as [`prefix_words`]. + /// + /// [`prefix_words`]: Self::prefix_words + pub suffix_words: usize, + /// Additive boost applied to the entity's confidence when a + /// keyword fires. Clamped at the [`Confidence`] ceiling on + /// apply. + pub boost: Confidence, +} + +impl BoostRule { + /// Construct a rule for `label` with explicit window radii + /// and `boost`. Most callers want [`BoostRule::for_label`] + /// instead — it bakes in the default window / boost values. 
+    #[must_use]
+    pub fn new(
+        label: EntityLabelRef,
+        keywords: impl IntoIterator<Item = impl Into<HipStr<'static>>>,
+        prefix_words: usize,
+        suffix_words: usize,
+        boost: Confidence,
+    ) -> Self {
+        Self {
+            label,
+            keywords: keywords.into_iter().map(Into::into).collect(),
+            prefix_words,
+            suffix_words,
+            boost,
+        }
+    }
+
+    /// Construct a rule for `label` using the crate's default
+    /// [`prefix_words`], [`suffix_words`], and [`boost`]
+    /// constants. The common case — recognizers building their
+    /// own boost rules from declared keywords don't need to
+    /// think about tuning knobs.
+    ///
+    /// [`prefix_words`]: DEFAULT_PREFIX_WORDS
+    /// [`suffix_words`]: DEFAULT_SUFFIX_WORDS
+    /// [`boost`]: DEFAULT_BOOST
+    #[must_use]
+    pub fn for_label(
+        label: EntityLabelRef,
+        keywords: impl IntoIterator<Item = impl Into<HipStr<'static>>>,
+    ) -> Self {
+        Self::new(
+            label,
+            keywords,
+            DEFAULT_PREFIX_WORDS,
+            DEFAULT_SUFFIX_WORDS,
+            Confidence::clamped(DEFAULT_BOOST),
+        )
+    }
+
+    /// Merge `other` into this rule by extending the keyword set
+    /// with any keywords not already present. Window radii and
+    /// `boost` are kept from `self` — callers that need different
+    /// values per source should construct independent rules and
+    /// keep them separate.
+    ///
+    /// # Panics
+    ///
+    /// Debug-asserts when the labels differ. Merging across labels
+    /// is a caller bug — rules are keyed by label and the engine
+    /// looks them up by label.
+    pub fn merge(&mut self, other: BoostRule) {
+        debug_assert_eq!(
+            self.label, other.label,
+            "BoostRule::merge requires matching labels",
+        );
+        // Owned set, updated per insert, so repeats inside `other.keywords` are dropped too.
+        let mut seen: HashSet<HipStr<'static>> = self.keywords.iter().cloned().collect();
+        for kw in other.keywords {
+            if seen.insert(kw.clone()) {
+                self.keywords.push(kw);
+            }
+        }
+    }
+}
diff --git a/crates/nvisy-context/src/tokens.rs b/crates/nvisy-context/src/tokens.rs
index 24181797..eb490adc 100644
--- a/crates/nvisy-context/src/tokens.rs
+++ b/crates/nvisy-context/src/tokens.rs
@@ -6,33 +6,29 @@
 //! text, and two precomputed predicates the enhancer reads
 //! (`is_stop`, `is_punct`).
 //!
-//! [`Tokens`] is the owning collection plus lookup helpers the
-//! enhancer uses: [`around`] gets the slice of tokens within a byte
-//! window, [`lemmas_in`] iterates lemmas covering a byte range.
-//! Both work in *source-text byte offsets* — the same coordinate
-//! space as [`Entity::location`] — so there's no coordinate
-//! translation at the call site.
+//! [`Tokens`] is the owning collection — a `Vec<Token>` newtype
+//! exposing iteration and length. The [`Enhancer`] slices the
+//! stream by *count* (prefix/suffix word radii) using its own
+//! internal helpers; the byte range carried on each [`Token`] is
+//! there for consumers that want to map a token back to its
+//! source-text substring.
 //!
-//! [`around`]: Tokens::around
-//! [`lemmas_in`]: Tokens::lemmas_in
-//! [`Entity::location`]: nvisy_core::entity::Entity::location
+//! [`Enhancer`]: super::Enhancer
 //!
-//! Tokens live next to the [`ContextEnhancer`] because that's the
-//! only consumer: the enhancer reads them off
+//! Tokens live next to the [`Enhancer`] because that's the only
+//! consumer: the enhancer reads them off
 //! `RecognizerInput::artifacts` to drive lemma-aware keyword
 //! matching. The producer (a tokenizer in some upstream NLP
 //! 
backend) only needs to know the type by name; the type itself -//! belongs in the consumer's neighborhood. +//! belongs in the consumer's neighbourhood. //! //! The shape is intentionally minimal. POS tags, morphology, -//! dependency trees, and other heavier features are not part of the -//! v1 surface; they get added as fields when a downstream consumer -//! needs them. This keeps the artifact cheap for engines that don't -//! produce them — `text == lemma`, `is_stop == false`, +//! dependency trees, and other heavier features are not part of +//! the v1 surface; they get added as fields when a downstream +//! consumer needs them. This keeps the artifact cheap for engines +//! that don't produce them — `text == lemma`, `is_stop == false`, //! `is_punct == false` are the defaults for a tokenizer-only //! engine. -//! -//! [`ContextEnhancer`]: super::ContextEnhancer use std::ops::Range; @@ -99,17 +95,18 @@ impl Token { } } -/// The owning token sequence carried by a -/// [`RecognizerInput::artifacts`] bundle. +/// Owning token sequence stamped on a +/// [`RecognizerInput::artifacts`] bundle by an upstream NLP engine. /// /// [`RecognizerInput::artifacts`]: nvisy_core::recognition::RecognizerInput::artifacts /// /// Tokens are sorted by `offset.start` (producers should emit them -/// in order; consumer-side code assumes this). The collection -/// exposes byte-range lookup helpers the [`ContextEnhancer`] uses -/// to pull lemmas around an entity match. +/// in order; consumer-side code assumes this). The [`Enhancer`] +/// borrows the underlying slice via [`as_slice`] and walks it by +/// count when scoring the entity's neighbourhood. 
/// -/// [`ContextEnhancer`]: super::ContextEnhancer +/// [`Enhancer`]: super::Enhancer +/// [`as_slice`]: Tokens::as_slice #[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct Tokens(Vec); @@ -150,47 +147,6 @@ impl Tokens { pub fn iter(&self) -> std::slice::Iter<'_, Token> { self.0.iter() } - - /// Tokens overlapping `byte_range`, plus a `window`-byte - /// margin on each side. - /// - /// Used by the enhancer to grab the keyword neighborhood around - /// an entity match. Returns the contiguous sub-slice; tokens at - /// the boundary are included when their byte range overlaps the - /// expanded range. - /// - /// Cost is `O(log n)` for the start probe + linear over the - /// returned slice; the sequence is sorted so a binary search - /// suffices. - #[must_use] - pub fn around(&self, byte_range: Range, window: usize) -> &[Token] { - let lo = byte_range.start.saturating_sub(window); - let hi = byte_range.end.saturating_add(window); - self.in_range(lo..hi) - } - - /// Lemmas of every token overlapping `byte_range`. Useful when - /// only the lemma strings are needed (e.g. for keyword - /// matching). - pub fn lemmas_in(&self, byte_range: Range) -> impl Iterator { - self.in_range(byte_range).iter().map(|t| t.lemma.as_str()) - } - - /// Tokens fully contained within (or overlapping) `byte_range`. - /// Returned as a sub-slice — tokens with `offset.end > range.start` - /// and `offset.start < range.end` are included. 
- #[must_use] - pub fn in_range(&self, byte_range: Range) -> &[Token] { - if self.0.is_empty() || byte_range.start >= byte_range.end { - return &[]; - } - let start = self.0.partition_point(|t| t.offset.end <= byte_range.start); - let end = self.0.partition_point(|t| t.offset.start < byte_range.end); - if start >= end { - return &[]; - } - &self.0[start..end] - } } impl FromIterator for Tokens { @@ -207,68 +163,3 @@ impl IntoIterator for Tokens { self.0.into_iter() } } - -#[cfg(test)] -mod tests { - use super::*; - - fn t(text: &'static str, start: usize, end: usize) -> Token { - Token::from_text(text, start..end) - } - - #[test] - fn in_range_returns_overlapping_tokens() { - let tokens = Tokens::new(vec![t("hello", 0, 5), t("world", 6, 11), t("foo", 12, 15)]); - // 4..7 overlaps "hello" and "world" - let got: Vec<&str> = tokens - .in_range(4..7) - .iter() - .map(|t| t.text.as_str()) - .collect(); - assert_eq!(got, vec!["hello", "world"]); - } - - #[test] - fn around_extends_by_window() { - let tokens = Tokens::new(vec![ - t("a", 0, 1), - t("b", 2, 3), - t("c", 4, 5), - t("d", 6, 7), - t("e", 8, 9), - ]); - // around 4..5 with window=2 → look at 2..7 → "b","c","d" - let got: Vec<&str> = tokens - .around(4..5, 2) - .iter() - .map(|t| t.text.as_str()) - .collect(); - assert_eq!(got, vec!["b", "c", "d"]); - } - - #[test] - fn lemmas_in_yields_lemmas() { - let tokens = Tokens::new(vec![ - t("running", 0, 7).with_lemma("run"), - t("dogs", 8, 12).with_lemma("dog"), - ]); - let got: Vec<&str> = tokens.lemmas_in(0..12).collect(); - assert_eq!(got, vec!["run", "dog"]); - } - - #[test] - fn in_range_empty_for_disjoint_range() { - let tokens = Tokens::new(vec![t("a", 0, 5)]); - assert!(tokens.in_range(10..20).is_empty()); - } - - #[test] - fn in_range_empty_for_inverted_range() { - let tokens = Tokens::new(vec![t("a", 0, 5)]); - let inverted = Range { - start: 5usize, - end: 3usize, - }; - assert!(tokens.in_range(inverted).is_empty()); - } -} diff --git 
a/crates/nvisy-context/src/wrapper.rs b/crates/nvisy-context/src/wrapper.rs
new file mode 100644
index 00000000..688f2838
--- /dev/null
+++ b/crates/nvisy-context/src/wrapper.rs
@@ -0,0 +1,77 @@
+//! [`Boosting`]: post-recognition keyword-boost wrapper for any
+//! [`EntityRecognizer`].
+//!
+//! Composes an inner recognizer with an [`Enhancer`]: the wrapper
+//! delegates `recognize` to the inner, then runs the enhancer
+//! over the produced entities. Equivalent to "the recognizer
+//! owns its boosting" without each recognizer reimplementing the
+//! enhancement step.
+//!
+//! Typical use:
+//!
+//! ```ignore
+//! let inner = MyRecognizer::new(...);
+//! let enhancer = Enhancer::new(rules, Box::new(SubstringMatcher));
+//! let recognizer = Boosting::new(inner, enhancer);
+//! ```
+//!
+//! The wrapper implements [`EntityRecognizer`] so the engine
+//! never has to know boosting happened.
+
+use nvisy_core::Result;
+use nvisy_core::modality::Text;
+use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput};
+
+use super::Enhancer;
+use super::Tokens;
+
+/// Wraps an [`EntityRecognizer`] with a post-recognition
+/// [`Enhancer`] pass. Implements [`EntityRecognizer`] so
+/// the wrapped recognizer is a drop-in replacement.
+///
+/// Assumes the inner recognizer emits entities whose byte offsets
+/// index into `input.data.text` (the standard
+/// [`EntityRecognizer`] contract). The wrapper reads the
+/// same `&str` for the keyword-window walk; a recognizer that
+/// emitted entities relative to a different coordinate space
+/// would yield misaligned windows or panic when the text is sliced.
+pub struct Boosting<R> {
+    inner: R,
+    enhancer: Enhancer,
+}
+
+impl<R> Boosting<R> {
+    /// Wrap `inner` with `enhancer`. After `recognize` produces
+    /// entities, `enhancer` runs over them in place.
+    pub fn new(inner: R, enhancer: Enhancer) -> Self {
+        Self { inner, enhancer }
+    }
+
+    /// Borrow the wrapped recognizer.
+ pub fn inner(&self) -> &R { + &self.inner + } + + /// Borrow the enhancer applied to the inner recognizer's + /// output. + pub fn enhancer(&self) -> &Enhancer { + &self.enhancer + } +} + +#[async_trait::async_trait] +impl EntityRecognizer for Boosting +where + R: EntityRecognizer + 'static, +{ + async fn recognize(&self, input: &RecognizerInput) -> Result> { + let mut output = self.inner.recognize(input).await?; + if self.enhancer.is_empty() { + return Ok(output); + } + let text = input.data.text.as_str(); + let tokens = input.artifacts.get::().map(Tokens::as_slice); + self.enhancer.enhance(&mut output.entities, text, tokens); + Ok(output) + } +} diff --git a/crates/nvisy-engine/Cargo.toml b/crates/nvisy-engine/Cargo.toml index cc4dcd90..97a99643 100644 --- a/crates/nvisy-engine/Cargo.toml +++ b/crates/nvisy-engine/Cargo.toml @@ -40,7 +40,6 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates nvisy-codec = { workspace = true, features = ["text"] } -nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } nvisy-llm = { workspace = true, features = [] } nvisy-ner = { workspace = true, features = [] } diff --git a/crates/nvisy-engine/src/core/context.rs b/crates/nvisy-engine/src/core/context.rs index 84391b2c..d988fd44 100644 --- a/crates/nvisy-engine/src/core/context.rs +++ b/crates/nvisy-engine/src/core/context.rs @@ -17,7 +17,6 @@ use std::num::NonZeroUsize; use std::sync::Arc; -use nvisy_context::ContextEnhancer; use nvisy_toolkit::detection::RecognizerRegistry; use nvisy_toolkit::extraction::ExtractorRegistry; use tokio_util::sync::CancellationToken; @@ -52,11 +51,6 @@ pub struct DetectionContext { /// engine-side detection-config template plus the request's /// label catalog. pub(crate) recognizer_registry: Arc, - /// Post-recognition keyword-boost enhancer — built alongside - /// `recognizer_registry` from the same recognizer set. 
Shared - /// behind `Arc` so per-document phases borrow it without - /// cloning the embedded registry / matcher. - pub(crate) context_enhancer: Arc, pub(crate) concurrency: Option, } @@ -67,7 +61,6 @@ pub struct DetectionContext { pub(crate) struct DetectionEngines { pub extraction_engine: ExtractorRegistry, pub recognizer_registry: Arc, - pub context_enhancer: Arc, } impl DetectionContext { @@ -82,14 +75,12 @@ impl DetectionContext { let DetectionEngines { extraction_engine, recognizer_registry, - context_enhancer, } = engines; Self { cancel, shared, extraction_engine, recognizer_registry, - context_enhancer, concurrency, } } @@ -108,14 +99,6 @@ impl DetectionContext { pub(crate) fn recognizer_registry(&self) -> &Arc { &self.recognizer_registry } - - /// Per-request context-keyword enhancer borrowed by - /// [`DetectionPhase`]. - /// - /// [`DetectionPhase`]: crate::detection::phases::detection::DetectionPhase - pub(crate) fn context_enhancer(&self) -> &Arc { - &self.context_enhancer - } } impl PhaseContext for DetectionContext { diff --git a/crates/nvisy-engine/src/detection/config/mod.rs b/crates/nvisy-engine/src/detection/config/mod.rs index 9dcd3a14..3f74c9ac 100644 --- a/crates/nvisy-engine/src/detection/config/mod.rs +++ b/crates/nvisy-engine/src/detection/config/mod.rs @@ -9,13 +9,17 @@ //! currently wired — those modules are parked pending rework to //! implement [`EntityRecognizer`] directly. //! +//! Each recognizer owns its own post-recognition processing +//! (boosting, deduplication-within-recognizer, validation post-pass). +//! The engine orchestrates recognizers; it does not orchestrate +//! recognizer-internal phases. +//! //! [`RecognizerRegistry`]: nvisy_toolkit::detection::RecognizerRegistry //! 
[`EntityRecognizer`]: nvisy_core::recognition::EntityRecognizer mod ner; mod pattern; -use nvisy_context::{ContextEnhancer, ContextRegistry}; #[cfg(not(feature = "bento"))] use nvisy_core::Error; use nvisy_core::Result; @@ -25,7 +29,7 @@ use nvisy_ner::NerRecognizer; use nvisy_ner::backend::NoopBackend; #[cfg(feature = "bento")] use nvisy_ner::backend::{BentoBackend, BentoParams}; -use nvisy_pattern::{PatternRecognizer, PatternRegistry}; +use nvisy_pattern::PatternRecognizer; use nvisy_toolkit::detection::RecognizerRegistry; pub use self::ner::{NerBackend, NerDetection}; @@ -35,26 +39,6 @@ pub use self::pattern::PatternDetection; /// provenance on emitted entities). const NER_RECOGNIZER_NAME: &str = "ner"; -/// Engine-wide defaults for the post-recognition [`ContextEnhancer`]. -/// Mirrors Presidio's defaults (`context_similarity_factor = 0.35`, -/// `context_prefix_count = ~5 words ≈ 50 bytes`). -const ENHANCER_DEFAULT_WINDOW: usize = 50; -const ENHANCER_DEFAULT_BOOST: f64 = 0.35; - -/// Bundle returned by [`DetectionConfig::build_for_request`]: -/// the per-request recognizer registry plus the matching -/// [`ContextEnhancer`] built from each recognizer's declared -/// context keywords. -pub struct DetectionResources { - /// Recognizers selected for this request. - pub recognizers: RecognizerRegistry, - /// Post-recognition keyword-boost enhancer for `Text` - /// entities. Always present; carries an empty registry when - /// no recognizer declared context keywords (cheap to skip - /// inside [`ContextEnhancer::enhance`]). - pub enhancer: ContextEnhancer, -} - /// Configuration for the [`RecognizerRegistry`]. /// /// Each field maps to a `[detection.*]` section in `Nvisy.toml`. @@ -92,19 +76,17 @@ impl DetectionConfig { /// Returns the first construction error encountered — pattern /// compile failure, NER backend init failure, or a /// config-selected backend whose feature wasn't compiled in. 
- pub fn build_for_request(&self, catalog: &EntityLabelCatalog) -> Result { + pub fn build_for_request(&self, catalog: &EntityLabelCatalog) -> Result { let mut reg = RecognizerRegistry::new(); - let mut context_registry = ContextRegistry::new(); let pattern_cfg = self.pattern.clone().unwrap_or_default(); if pattern_cfg.enabled { - let pattern_registry = PatternRegistry::builtin().filter_by_catalog(catalog); - if !pattern_registry.is_empty() { - context_registry = context_registry.merge(pattern_registry.context_registry()); - let recognizer = PatternRecognizer::builder() - .with_registry(pattern_registry) - .build()?; - reg = reg.with_recognizer::(recognizer); + let builder = PatternRecognizer::builder() + .with_builtin_patterns() + .with_builtin_dictionaries() + .filter_by_catalog(catalog); + if !builder.is_empty() { + reg = reg.with_recognizer::(builder.build()?); } } @@ -135,20 +117,9 @@ impl DetectionConfig { )); } }; - context_registry = context_registry.merge(recognizer.context_registry()); reg = reg.with_recognizer::(recognizer); } - let enhancer = ContextEnhancer::builder() - .with_registry(context_registry) - .with_default_window(ENHANCER_DEFAULT_WINDOW) - .with_default_boost(ENHANCER_DEFAULT_BOOST) - .build() - .expect("enhancer fields (window, boost, registry) all set"); - - Ok(DetectionResources { - recognizers: reg, - enhancer, - }) + Ok(reg) } } diff --git a/crates/nvisy-engine/src/detection/document.rs b/crates/nvisy-engine/src/detection/document.rs index ab9f2207..6698dd26 100644 --- a/crates/nvisy-engine/src/detection/document.rs +++ b/crates/nvisy-engine/src/detection/document.rs @@ -26,10 +26,7 @@ impl DetectionDocumentPipeline { pub(super) fn from_context(ctx: &DetectionContext) -> Self { Self { extraction: ExtractionPhase::new(ctx.extraction_engine().clone()), - detection: DetectionPhase::new( - ctx.recognizer_registry().clone(), - ctx.context_enhancer().clone(), - ), + detection: DetectionPhase::new(ctx.recognizer_registry().clone()), 
deduplication: DeduplicationPhase::new(), } } diff --git a/crates/nvisy-engine/src/detection/mod.rs b/crates/nvisy-engine/src/detection/mod.rs index b0c2e6d0..31e50884 100644 --- a/crates/nvisy-engine/src/detection/mod.rs +++ b/crates/nvisy-engine/src/detection/mod.rs @@ -28,9 +28,7 @@ mod result; mod state; mod status; -pub use self::config::{ - DetectionConfig, DetectionResources, NerBackend, NerDetection, PatternDetection, -}; +pub use self::config::{DetectionConfig, NerBackend, NerDetection, PatternDetection}; pub use self::engine::DetectionEngine; pub use self::extraction::ExtractionConfig; #[cfg(feature = "image")] diff --git a/crates/nvisy-engine/src/detection/phases/detection.rs b/crates/nvisy-engine/src/detection/phases/detection.rs index 9857308b..ecbd163f 100644 --- a/crates/nvisy-engine/src/detection/phases/detection.rs +++ b/crates/nvisy-engine/src/detection/phases/detection.rs @@ -11,10 +11,8 @@ use std::sync::Arc; -use nvisy_context::ContextEnhancer; use nvisy_core::Result; use nvisy_core::entity::Entity; -use nvisy_core::extraction::Artifacts; use nvisy_core::modality::{ Audio, AudioLocation, Image, ImageLocation, Overlap, Tabular, TabularLocation, Text, TextData, TextLocation, @@ -35,21 +33,20 @@ const TARGET: &str = "nvisy_engine::detection"; /// /// Holds an `Arc` so the registry is shared /// cheaply across per-document phases without cloning the -/// underlying recognizer lists, plus an `Arc` for -/// the post-recognition keyword-boost pass. +/// underlying recognizer lists. Recognizers own any post-detection +/// work they need (boosting, dedup, validation) — the engine just +/// orchestrates the registry. /// /// [`EntityRecord`]: crate::document::provenance::EntityRecord pub struct DetectionPhase { registry: Arc, - enhancer: Arc, } impl DetectionPhase { - /// Build the phase from the shared recognizer registry and - /// matching context enhancer. Called once per pipeline by the - /// pipeline orchestrator. 
- pub fn new(registry: Arc, enhancer: Arc) -> Self { - Self { registry, enhancer } + /// Build the phase from the shared recognizer registry. Called + /// once per pipeline by the pipeline orchestrator. + pub fn new(registry: Arc) -> Self { + Self { registry } } pub(crate) async fn apply_text( @@ -88,7 +85,7 @@ impl DetectionPhase { let span = tracing::info_span!(target: TARGET, "phase", name = "detection.image"); let run_id = ctx.shared().run_id; async move { - detect_text_blocks(&self.registry, &self.enhancer, &mut tree.root, run_id).await?; + detect_text_blocks(&self.registry, &mut tree.root, run_id).await?; detect_image_chunks( &self.registry, &mut tree.root, @@ -115,7 +112,7 @@ impl DetectionPhase { let span = tracing::info_span!(target: TARGET, "phase", name = "detection.text_only"); let run_id = ctx.shared().run_id; async move { - detect_text_blocks(&self.registry, &self.enhancer, doc, run_id).await?; + detect_text_blocks(&self.registry, doc, run_id).await?; Ok(()) } .instrument(span) @@ -127,7 +124,6 @@ impl DetectionPhase { /// text via [`ModalityBlock::scan_text`] (today: every modality). async fn detect_text_blocks( registry: &RecognizerRegistry, - enhancer: &ContextEnhancer, doc: &mut Document, run_id: uuid::Uuid, ) -> Result<()> @@ -154,13 +150,7 @@ where let mut input = RecognizerInput::new(TextData::new(text.to_owned())); input.correlation_id = Some(run_id); - let mut detected = registry.run::(input).await?; - // Apply context-keyword boosting in block-local coordinates, - // before lifting to modality-absolute locations. The shared - // NLP-pass producer hasn't been wired into the detection - // pipeline yet, so we pass an empty `Artifacts` — the - // enhancer's substring path runs without it. 
- enhancer.enhance(&mut detected, text, &Artifacts::new()); + let detected = registry.run::(input).await?; for entity in detected { let Some(location) = M::lift_from_block(&block.spans, entity.location.start, entity.location.end) diff --git a/crates/nvisy-engine/src/detection/pipeline.rs b/crates/nvisy-engine/src/detection/pipeline.rs index be2c0118..43fdd871 100644 --- a/crates/nvisy-engine/src/detection/pipeline.rs +++ b/crates/nvisy-engine/src/detection/pipeline.rs @@ -132,12 +132,12 @@ impl DetectionPipeline { ) -> Result<(Vec, u64, DetectionStatus), Error> { let actor_id = prepared.actor_id; - let (recognizer_registry, context_enhancer) = match self + let recognizer_registry = match self .state .detection_config .build_for_request(&prepared.catalog) { - Ok(r) => (Arc::new(r.recognizers), Arc::new(r.enhancer)), + Ok(r) => Arc::new(r), Err(e) => { self.detections.fail(self.detection_id, e.to_string()).await; return Err(e); @@ -161,7 +161,6 @@ impl DetectionPipeline { let engines = DetectionEngines { extraction_engine: (*self.state.extraction_engine).clone(), recognizer_registry, - context_enhancer, }; let concurrency = self.base_config.effective_concurrency(); let ctx = DetectionContext::new(cancel, Arc::new(shared_data), engines, concurrency); diff --git a/crates/nvisy-ner/Cargo.toml b/crates/nvisy-ner/Cargo.toml index 6758bc16..a6e3739e 100644 --- a/crates/nvisy-ner/Cargo.toml +++ b/crates/nvisy-ner/Cargo.toml @@ -32,7 +32,6 @@ rustdoc-args = ["--cfg", "docsrs"] [dependencies] # Internal crates -nvisy-context = { workspace = true, features = [] } nvisy-core = { workspace = true, features = [] } # Serialization diff --git a/crates/nvisy-ner/src/nlp/capabilities.rs b/crates/nvisy-ner/src/nlp/capabilities.rs index 4e4143af..49d68ef3 100644 --- a/crates/nvisy-ner/src/nlp/capabilities.rs +++ b/crates/nvisy-ner/src/nlp/capabilities.rs @@ -4,7 +4,7 @@ //! Composition-time contract between an `NlpEngine` and the //! recognizers / enhancer that read its artifacts. 
Lets the engine //! orchestrator refuse impossible asks at construction time — e.g. -//! wiring a lemma-aware `ContextEnhancer` to a tokenizer-only +//! wiring a lemma-aware enhancer to a tokenizer-only //! engine that doesn't produce lemmas. //! //! Booleans rather than an enum because capabilities are diff --git a/crates/nvisy-ner/src/nlp/engine.rs b/crates/nvisy-ner/src/nlp/engine.rs index 2ffa1779..2766e9e5 100644 --- a/crates/nvisy-ner/src/nlp/engine.rs +++ b/crates/nvisy-ner/src/nlp/engine.rs @@ -1,26 +1,22 @@ //! [`NlpEngine`]: the producer-side trait that builds the //! shared-NLP-pass [`TypeMap`] for one or more texts. //! -//! Engines stamp typed enrichment entries — -//! [`LanguageDetections`] and [`Tokens`] — into the returned -//! `TypeMap`. An orchestrator that wants shared NLP runs -//! `process` once per scan, wraps the result in [`Artifacts`], and -//! attaches it to each [`RecognizerInput`] via +//! Engines stamp typed enrichment entries (`LanguageDetections` +//! today; token artifacts when the upstream service supports +//! them) into the returned `TypeMap`. An orchestrator that wants +//! shared NLP runs `process` once per scan, wraps the result in +//! [`Artifacts`], and attaches it to each [`RecognizerInput`] via //! [`RecognizerInput::with_artifacts`]. //! -//! [`LanguageDetections`]: nvisy_core::primitive::LanguageDetections -//! [`Tokens`]: nvisy_context::Tokens //! [`Artifacts`]: nvisy_core::extraction::Artifacts //! [`RecognizerInput`]: nvisy_core::recognition::RecognizerInput //! [`RecognizerInput::with_artifacts`]: nvisy_core::recognition::RecognizerInput::with_artifacts //! //! Pluggable so different deployment shapes (pure language -//! detection, hosted full-NLP service, future in-process model) can -//! be wired interchangeably. The orchestrator calls `process` (or -//! `process_batch`) once per scan; recognizers and the -//! [`ContextEnhancer`] borrow the resulting map by reference. -//! -//! 
[`ContextEnhancer`]: nvisy_context::ContextEnhancer +//! detection, hosted full-NLP service, future in-process model) +//! can be wired interchangeably. The orchestrator calls `process` +//! (or `process_batch`) once per scan; recognizers and the +//! keyword-boost enhancer borrow the resulting map by reference. use nvisy_core::Result; use nvisy_core::primitive::LanguageTag; diff --git a/crates/nvisy-ner/src/nlp/mod.rs b/crates/nvisy-ner/src/nlp/mod.rs index 77fef86d..ab265acc 100644 --- a/crates/nvisy-ner/src/nlp/mod.rs +++ b/crates/nvisy-ner/src/nlp/mod.rs @@ -1,13 +1,13 @@ //! Producer side of the shared-NLP-pass primitive. //! -//! Consumer-side types live in `nvisy-core` so any text consumer -//! (pattern recognizers, NER adapters, context enhancer) can read -//! them without depending on this crate: -//! [`LanguageDetections`] sits with the language primitives; -//! [`Tokens`] sits next to the [`ContextEnhancer`] that consumes -//! it. This module declares the [`NlpEngine`] trait and the -//! engines that produce those artifacts into the shared `TypeMap` -//! stamped on `RecognizerInput.artifacts`. +//! Consumer-side types live in their natural crates so any text +//! consumer can read them without depending on this one: +//! [`LanguageDetections`] sits with the language primitives in +//! `nvisy-core`; the optional token artifact lives in +//! `nvisy-context` next to its only consumer (the keyword-boost +//! `Enhancer`). This module declares the [`NlpEngine`] trait and +//! the engines that produce those artifacts into the shared +//! `TypeMap` stamped on `RecognizerInput.artifacts`. //! //! One engine ships today: //! - [`LinguaNlpEngine`] — language-only NLP, backed by the @@ -21,9 +21,7 @@ //! The trait is async because realistic implementations are //! HTTP-bound or otherwise yield. //! -//! [`Tokens`]: nvisy_context::Tokens //! [`LanguageDetections`]: nvisy_core::primitive::LanguageDetections -//! [`ContextEnhancer`]: nvisy_context::ContextEnhancer //! 
[`lingua`]: https://crates.io/crates/lingua //! [`NerBackend`]: crate::backend::NerBackend //! [`NerRecognizer`]: crate::NerRecognizer diff --git a/crates/nvisy-ner/src/recognition/config.rs b/crates/nvisy-ner/src/recognition/config.rs index a50c7b6f..c8af8c87 100644 --- a/crates/nvisy-ner/src/recognition/config.rs +++ b/crates/nvisy-ner/src/recognition/config.rs @@ -55,16 +55,6 @@ pub struct NerModel { /// Alignment policy for sub-word predictions. Same advisory /// status as `aggregation`. pub alignment: AlignmentMode, - /// Per-recognizer context-keyword list for the post-recognition - /// [`ContextEnhancer`]. - /// Empty when the recognizer doesn't participate in boosting. - /// Each emitted entity's source name keys the lookup, so the - /// recognizer's [`name`] is used - /// as the registration key. - /// - /// [`ContextEnhancer`]: nvisy_context::ContextEnhancer - /// [`name`]: super::NerRecognizer::name - pub default_context: Vec, } impl Default for NerModel { @@ -77,7 +67,6 @@ impl Default for NerModel { low_score_multiplier: 0.4, aggregation: AggregationStrategy::Max, alignment: AlignmentMode::Expand, - default_context: Vec::new(), } } } @@ -108,7 +97,6 @@ impl NerModelBuilder { .unwrap_or(defaults.low_score_multiplier), aggregation: self.aggregation.unwrap_or(defaults.aggregation), alignment: self.alignment.unwrap_or(defaults.alignment), - default_context: self.default_context.unwrap_or(defaults.default_context), } } } diff --git a/crates/nvisy-ner/src/recognition/recognizer.rs b/crates/nvisy-ner/src/recognition/recognizer.rs index bbdca67b..1c4647e6 100644 --- a/crates/nvisy-ner/src/recognition/recognizer.rs +++ b/crates/nvisy-ner/src/recognition/recognizer.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use derive_builder::Builder; -use nvisy_context::{Context, ContextRegistry}; use nvisy_core::entity::{Entity, EntityLabelRef, ModelProvenance, TrailProvenance, TrailStep}; use nvisy_core::modality::{Text, TextLocation}; use nvisy_core::primitive::Confidence; @@ 
-37,12 +36,8 @@ use crate::backend::{NerBackend, NerRequest, RawNerSpan}; build_fn(error = "Error", name = "try_build", private) )] pub struct NerRecognizer { - /// Recognizer name. Surfaced in trail provenance and used as - /// the key the [`ContextEnhancer`] looks up to find the - /// recognizer's [`default_context`]. - /// - /// [`ContextEnhancer`]: nvisy_context::ContextEnhancer - /// [`default_context`]: NerModel::default_context + /// Recognizer name. Surfaced in trail provenance on every + /// emitted entity. name: String, /// Backend that turns `(text, kinds)` into raw spans. Required. /// Set via [`with_engine`], which accepts any concrete @@ -92,24 +87,6 @@ impl NerRecognizer { &self.model } - /// Build a [`ContextRegistry`] containing this recognizer's - /// [`default_context`] keyed on the recognizer's name. Returns - /// an empty registry when no keywords were declared. - /// - /// Mirrors `PatternRegistry::context_registry` so engine code - /// can merge per-recognizer contexts from every text-modality - /// recognizer into one enhancer input without duplicating the - /// keyword data. 
- /// - /// [`default_context`]: NerModel::default_context - #[must_use] - pub fn context_registry(&self) -> ContextRegistry { - ContextRegistry::new().with_entry( - self.name.clone(), - Context::new(self.model.default_context.iter().cloned()), - ) - } - fn build_entity(&self, span: &RawNerSpan, label: EntityLabelRef) -> Entity { let raw_confidence = Confidence::try_clamped(span.score).unwrap_or(self.model.default_score); diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 1c17cc80..d89d43fc 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -29,7 +29,6 @@ nvisy-core = { workspace = true, features = [] } # Serialization serde = { workspace = true, features = [] } -schemars = { workspace = true, features = [] } toml = { workspace = true, features = ["parse"] } # Derive macros and error handling diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md index 8f86d1ee..7d299119 100644 --- a/crates/nvisy-pattern/README.md +++ b/crates/nvisy-pattern/README.md @@ -2,20 +2,41 @@ [![Build](https://img.shields.io/github/actions/workflow/status/nvisycom/runtime/build.yml?branch=main&label=build%20%26%20test&style=flat-square)](https://github.com/nvisycom/runtime/actions/workflows/build.yml) -Built-in patterns, dictionaries, and validators for PII/PHI detection in the +Regex and dictionary recognizers for PII / PHI detection in the Nvisy runtime. ## Overview -A pre-compiled pattern engine for PII/PHI detection. Each scan runs -regex (`RegexSet`-prefiltered), dictionary lookup (Aho-Corasick), -and deny-list injection. Built-in patterns and dictionaries live as -JSON under `assets/` and are embedded at compile time. - -Per-scan inputs (allow / deny lists, context-keyword hints, -caller-supplied ad-hoc patterns) flow through `PatternContext` without -rebuilding the engine. Regex patterns can opt into post-match -validation by name (e.g. `"luhn"`, `"ssn"`, `"iban"`). 
+`PatternRecognizer` compiles a set of `Regex` rules (each holding +one or more regex `Variant`s, a Presidio-shaped multi-strategy +group) and `Dictionary` term lists into pooled scanners — one +shared `regex::RegexSet` for the regex side and one shared +`aho_corasick::AhoCorasick` automaton for the literal side. A +single walk over the input runs both scanners and emits +`Entity` values in modality-local byte coordinates. + +Each rule may declare per-label context keywords; the recognizer +wraps itself in a `nvisy_context::Boosting` layer at build time +that lifts confidence on matches whose neighbourhood contains a +declared keyword. + +The built-in pattern + dictionary set lives as TOML under +`assets/` and is embedded at compile time. The recognizer's +builder accepts both built-ins and user-supplied rules: + +```rust +use nvisy_pattern::PatternRecognizer; + +let recognizer = PatternRecognizer::builder() + .with_builtin_patterns() + .with_builtin_dictionaries() + .build() + .expect("built-in recognizer builds"); +``` + +Regex variants can opt into a post-match validator by name +(`"luhn"`, `"ssn"`, `"iban"`, `"phone"`, `"date"`); custom +validators can be registered via `ValidatorRegistry::with`. ## Documentation diff --git a/crates/nvisy-pattern/assets/dictionaries/general/languages.toml b/crates/nvisy-pattern/assets/dictionaries/general/languages.toml index d356fef1..0a7e0aee 100644 --- a/crates/nvisy-pattern/assets/dictionaries/general/languages.toml +++ b/crates/nvisy-pattern/assets/dictionaries/general/languages.toml @@ -1,12 +1,7 @@ name = "languages" label = "language" -score = 0.85 -# Per-CSV-column overrides: -# column 0 = long-form names (`English`, `Spanish`, ...) — high -# confidence; collisions with everyday words are rare. -# column 1 = ISO 639-1 codes (`en`, `es`, ...) — low confidence; -# two-letter codes routinely collide with English words -# like `or` (Odia), `it` (Italian), `am` (Amharic). 
-# Below the dedup default threshold of 0.5 so they -# drop unless an operator explicitly lowers the floor. -column_scores = [0.85, 0.30] + +# column 0 = long-form names (`English`, `Spanish`, ...) +# column 1 = ISO 639-1 codes (`en`, `es`, ...) +# column 2 = alternate long-form names (`Farsi` for Persian) +score = [0.85, 0.30, 0.85] diff --git a/crates/nvisy-pattern/assets/patterns/contact/email.toml b/crates/nvisy-pattern/assets/patterns/contact/email.toml index 13e70bcf..fb37ff45 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/email.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/email.toml @@ -1,4 +1,6 @@ name = "email" label = "email_address" + +[[variants]] regex = "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/contact/phone.toml b/crates/nvisy-pattern/assets/patterns/contact/phone.toml index b2e1faf5..01df2224 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/phone.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/phone.toml @@ -1,9 +1,8 @@ name = "phone" label = "phone_number" +context = ["phone", "call", "mobile", "tel", "fax", "contact"] + +[[variants]] regex = "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b" score = 0.8 validator = "phone" - -[context] -keywords = ["phone", "call", "mobile", "tel", "fax", "contact"] -penalty = 0.15 diff --git a/crates/nvisy-pattern/assets/patterns/contact/url.toml b/crates/nvisy-pattern/assets/patterns/contact/url.toml index 24c3c9a2..ec11fcee 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/url.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/url.toml @@ -1,4 +1,6 @@ name = "url" label = "url" + +[[variants]] regex = "\\bhttps?://[^\\s/$.?#][^\\s]*\\b" score = 0.9 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml index 2748a222..189aacc9 100644 --- 
a/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml @@ -1,4 +1,6 @@ name = "aws-key" label = "api_key" + +[[variants]] regex = "\\bAKIA[0-9A-Z]{16}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml index 4c851fa1..be69abc5 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml @@ -1,4 +1,6 @@ name = "generic-api-key" label = "api_key" + +[[variants]] regex = "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?" score = 0.7 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml b/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml index 39c9bb1c..ba247e60 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml @@ -1,4 +1,6 @@ name = "github-token" label = "auth_token" + +[[variants]] regex = "\\bgh[pousr]_[a-zA-Z0-9]{36}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml index cdaff752..61d6977e 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml @@ -1,4 +1,6 @@ name = "private-key" label = "private_key" + +[[variants]] regex = "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----" score = 0.98 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml index 73437d3b..127517f6 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml +++ 
b/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml @@ -1,4 +1,6 @@ name = "stripe-key" label = "api_key" + +[[variants]] regex = "\\bsk_(live|test)_[a-zA-Z0-9]{24,}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml b/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml index 23f78873..a68c435a 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml @@ -1,4 +1,6 @@ name = "bitcoin-address" label = "crypto_address" + +[[variants]] regex = "\\b(?:bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\\b" score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml b/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml index 9d73cd20..78b3325a 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml @@ -1,8 +1,8 @@ name = "credit-card" label = "payment_card" +context = ["card", "credit", "debit", "payment", "visa", "mastercard", "amex"] + +[[variants]] regex = "\\b(?:\\d[ \\-]*?){13,19}\\b" score = 0.85 validator = "luhn" - -[context] -keywords = ["card", "credit", "debit", "payment", "visa", "mastercard", "amex"] diff --git a/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml b/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml index 02fa0939..2860d8a4 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml @@ -1,4 +1,6 @@ name = "ethereum-address" label = "crypto_address" + +[[variants]] regex = "\\b0x[0-9a-fA-F]{40}\\b" score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/finance/iban.toml b/crates/nvisy-pattern/assets/patterns/finance/iban.toml index 364dff6c..7256b240 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/iban.toml +++ 
b/crates/nvisy-pattern/assets/patterns/finance/iban.toml @@ -1,8 +1,8 @@ name = "iban" label = "iban" +context = ["iban", "bank", "account", "transfer", "swift"] + +[[variants]] regex = "\\b[A-Z]{2}\\d{2}\\s?[A-Z0-9]{4}\\s?(?:\\d{4}\\s?){2,7}\\d{1,4}\\b" score = 0.85 validator = "iban" - -[context] -keywords = ["iban", "bank", "account", "transfer", "swift"] diff --git a/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml b/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml index 39b5c508..7147b65c 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml @@ -1,4 +1,6 @@ name = "swift-code" label = "swift_code" + +[[variants]] regex = "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b" score = 0.7 diff --git a/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml b/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml index b6fadd82..12010716 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml @@ -1,4 +1,6 @@ name = "us-bank-routing" label = "bank_routing" + +[[variants]] regex = "\\b(?:0[1-9]|[12]\\d|3[0-2])\\d{7}\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/identity/ssn.toml b/crates/nvisy-pattern/assets/patterns/identity/ssn.toml index 17028ed5..f2076b26 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/ssn.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/ssn.toml @@ -1,8 +1,8 @@ name = "ssn" label = "government_id" +context = ["social security", "ssn", "tax id", "taxpayer identification"] + +[[variants]] regex = "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b" score = 0.9 validator = "ssn" - -[context] -keywords = ["social security", "ssn", "tax id", "taxpayer identification"] diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml 
b/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml index 0720e2b9..873af318 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml @@ -1,4 +1,6 @@ name = "us-drivers-license" label = "drivers_license" + +[[variants]] regex = "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b" score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml b/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml index 48da58bd..d7087d83 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml @@ -1,4 +1,6 @@ name = "us-passport" label = "passport_number" + +[[variants]] regex = "\\b[A-Z]\\d{8}\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml b/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml index adf40e1d..737b391f 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml @@ -1,4 +1,6 @@ name = "us-postal-code" label = "postal_code" + +[[variants]] regex = "\\b\\d{5}(?:-\\d{4})?\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/network/ipv4.toml b/crates/nvisy-pattern/assets/patterns/network/ipv4.toml index 914c6b46..d64403dd 100644 --- a/crates/nvisy-pattern/assets/patterns/network/ipv4.toml +++ b/crates/nvisy-pattern/assets/patterns/network/ipv4.toml @@ -1,4 +1,6 @@ name = "ipv4" label = "ip_address" + +[[variants]] regex = "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b" score = 0.75 diff --git a/crates/nvisy-pattern/assets/patterns/network/ipv6.toml b/crates/nvisy-pattern/assets/patterns/network/ipv6.toml index 0107ad00..dfc12ecd 100644 --- a/crates/nvisy-pattern/assets/patterns/network/ipv6.toml +++ 
b/crates/nvisy-pattern/assets/patterns/network/ipv6.toml @@ -1,4 +1,6 @@ name = "ipv6" label = "ip_address" + +[[variants]] regex = "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b" score = 0.75 diff --git a/crates/nvisy-pattern/assets/patterns/network/mac_address.toml b/crates/nvisy-pattern/assets/patterns/network/mac_address.toml index 2766fc31..fcca5944 100644 --- a/crates/nvisy-pattern/assets/patterns/network/mac_address.toml +++ b/crates/nvisy-pattern/assets/patterns/network/mac_address.toml @@ -1,4 +1,6 @@ name = "mac-address" label = "mac_address" + +[[variants]] regex = "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b" score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml b/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml index 8bafa63f..c88f21b9 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml +++ b/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml @@ -1,9 +1,8 @@ name = "date_of_birth" label = "date_of_birth" +context = ["birth", "born", "dob", "birthday"] + +[[variants]] regex = "\\b(?:(?:0[1-9]|1[0-2]|[1-9])[/\\-](?:0[1-9]|[12]\\d|3[01]|[1-9])[/\\-](?:19|20)\\d{2}|(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01]))\\b" score = 0.6 validator = "date" - -[context] -keywords = ["birth", "born", "dob", "birthday"] -penalty = 0.1 diff --git a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml b/crates/nvisy-pattern/assets/patterns/personal/datetime.toml index 10ed0de4..c1e00f7a 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml +++ b/crates/nvisy-pattern/assets/patterns/personal/datetime.toml @@ -1,7 +1,7 @@ name = "datetime" label = "date_time" +context = ["timestamp", "created", "modified", "logged", "at", "time"] + +[[variants]] regex = "\\b(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[T 
](?:[01]\\d|2[0-3]):[0-5]\\d(?::[0-5]\\d)?(?:Z|[+\\-]\\d{2}:?\\d{2})?\\b" score = 0.7 - -[context] -keywords = ["timestamp", "created", "modified", "logged", "at", "time"] diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index 3222cca9..ed069016 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -7,7 +7,7 @@ mod shipped; pub mod validators; pub use self::recognition::{ - Dictionary, DictionaryBuilder, PatternRecognizer, PatternRecognizerBuilder, PatternRegistry, - Regex, RegexBuilder, Terms, + Dictionary, DictionaryBuilder, PatternRecognizer, PatternRecognizerBuilder, Regex, + RegexBuilder, Scoring, Term, Terms, Variant, VariantBuilder, }; pub use self::shipped::{dictionaries, patterns}; diff --git a/crates/nvisy-pattern/src/recognition/compiled.rs b/crates/nvisy-pattern/src/recognition/compiled.rs new file mode 100644 index 00000000..1283025f --- /dev/null +++ b/crates/nvisy-pattern/src/recognition/compiled.rs @@ -0,0 +1,166 @@ +//! Compiled, recognizer-ready forms of [`Regex`] rules and +//! [`Dictionary`]s. +//! +//! [`PatternRecognizerBuilder::build`] compiles each regex variant +//! into a [`::regex::Regex`] and folds every dictionary's terms +//! into a shared [`AhoCorasick`] automaton, then stores the +//! per-rule emission metadata next to those scanners. This module +//! holds the per-rule metadata structs ([`CompiledPattern`], +//! [`CompiledDictionary`]) and their `build_entity` constructors — +//! the bits that turn a regex / Aho-Corasick hit into an +//! `Entity`. +//! +//! [`Regex`]: super::Regex +//! [`Dictionary`]: super::Dictionary +//! [`AhoCorasick`]: aho_corasick::AhoCorasick +//! 
[`PatternRecognizerBuilder::build`]: super::PatternRecognizerBuilder::build + +use std::sync::Arc; + +use nvisy_core::entity::{Entity, EntityLabelRef, PatternProvenance, TrailProvenance, TrailStep}; +use nvisy_core::modality::{Text, TextLocation}; +use nvisy_core::primitive::{Confidence, LanguageTag}; +use regex::Regex; + +use crate::validators::Validator; + +/// One compiled regex slot: a single `(pattern, variant)` pair, +/// keyed in the shared `RegexSet` by its position in +/// `PatternRecognizer.patterns`. Pattern-level metadata (name, +/// label, languages) is repeated across the pattern's variants so +/// the dispatch loop has everything it needs without a second +/// indirection. +/// +/// `context` is intentionally not stored on compiled state — the +/// recognizer's wrapping `Boosting` layer harvests keywords from +/// the source patterns at build time. +pub(super) struct CompiledPattern { + /// Pattern name (e.g. `"ssn"`). Surfaced in trail provenance. + pub pattern_name: String, + pub label: EntityLabelRef, + pub regex: Regex, + pub score: Confidence, + pub validator: Option>, + /// Languages the parent pattern applies to. + /// Empty means "any language". + pub languages: Vec, +} + +impl CompiledPattern { + /// Emit an `Entity` for a regex match at `[start, end)` + /// in modality-local byte coordinates. The recognizer phase + /// lifts the location to absolute document coordinates after + /// dispatch. 
+ pub(super) fn build_entity(&self, start: usize, end: usize) -> Entity { + let provenance = TrailProvenance::Pattern(PatternProvenance::Regex { + name: self.pattern_name.clone(), + regex: Some(self.regex.as_str().to_owned()), + validator: self.validator.as_ref().map(|_| self.pattern_name.clone()), + contextual: false, + }); + let step = TrailStep::recognition( + "pattern", + self.score, + provenance, + format!("pattern `{}` matched", self.pattern_name), + ); + Entity::builder() + .with_label(self.label.clone()) + .with_trail(vec![step]) + .with_confidence(self.score) + .with_location(TextLocation::new(start, end)) + .build() + .expect("required fields provided") + } +} + +/// Source of truth for one runtime dictionary: its term range +/// inside the shared Aho-Corasick automaton, plus per-dictionary +/// emission metadata. +pub(super) struct CompiledDictionary { + pub name: String, + pub label: EntityLabelRef, + /// First term-id (inclusive) for this dictionary inside the + /// shared automaton. + pub term_start: usize, + /// One past the last term-id for this dictionary inside the + /// shared automaton. + pub term_end: usize, + /// Per-term confidence, indexed by `term_id - term_start`. + /// Resolved at compile time from the dictionary's `scoring` + /// policy and any per-term overrides. + pub term_scores: Vec, + /// Languages this dictionary applies to. Empty means "any + /// language". + pub languages: Vec, + /// Reject matches whose immediate neighbours are word + /// characters (alphanumeric or `_`). Mirrors regex `\b`. + pub word_boundary: bool, +} + +impl CompiledDictionary { + /// Emit an `Entity` for an Aho-Corasick hit at + /// `[start, end)` in modality-local byte coordinates. `score` + /// is the per-term confidence resolved at recognizer-build + /// time (the dictionary's `scoring` policy or per-term + /// override). 
+ pub(super) fn build_entity(&self, score: Confidence, start: usize, end: usize) -> Entity { + let provenance = TrailProvenance::Pattern(PatternProvenance::Dictionary { + name: self.name.clone(), + contextual: false, + }); + let step = TrailStep::recognition( + "pattern", + score, + provenance, + format!("dictionary `{}` matched", self.name), + ); + Entity::builder() + .with_label(self.label.clone()) + .with_trail(vec![step]) + .with_confidence(score) + .with_location(TextLocation::new(start, end)) + .build() + .expect("required fields provided") + } +} + +/// Mirror of regex `\b` for the byte range `text[start..end]`: +/// the immediate neighbour characters (or start/end of input) +/// must not be word characters. A word character here is Unicode +/// alphanumeric or `_`, matching the conventional regex +/// definition. +/// +/// Operates on `char` boundaries, not raw bytes, so multibyte +/// codepoints don't trigger false rejections (`é` is one char, +/// not two). +pub(super) fn has_word_boundaries(text: &str, start: usize, end: usize) -> bool { + let left_is_word = text[..start].chars().next_back().is_some_and(is_word_char); + let right_is_word = text[end..].chars().next().is_some_and(is_word_char); + !left_is_word && !right_is_word +} + +fn is_word_char(c: char) -> bool { + c.is_alphanumeric() || c == '_' +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn has_word_boundaries_handles_edges_and_unicode() { + // Match touches both edges of the input → boundaries OK. + assert!(has_word_boundaries("hello", 0, 5)); + // Match preceded by a word char → not a boundary. + assert!(!has_word_boundaries("example", 5, 7)); + // Match followed by a word char → not a boundary. + assert!(!has_word_boundaries("amount", 0, 2)); + // Space surround → boundaries OK. + assert!(has_word_boundaries(" am ", 1, 3)); + // Unicode word char on the left → not a boundary. + assert!(!has_word_boundaries("café_am", 5, 7)); + // Punctuation around → boundaries OK. 
+ assert!(has_word_boundaries("(am)", 1, 3)); + } +} diff --git a/crates/nvisy-pattern/src/recognition/dictionary.rs b/crates/nvisy-pattern/src/recognition/dictionary.rs index 7875c2a2..bf20a1e0 100644 --- a/crates/nvisy-pattern/src/recognition/dictionary.rs +++ b/crates/nvisy-pattern/src/recognition/dictionary.rs @@ -1,40 +1,111 @@ //! [`Dictionary`]: literal-term detection rule. //! //! A dictionary scans for a fixed list of literal strings using an -//! Aho-Corasick automaton. Compared with [`Pattern`], a dictionary +//! Aho-Corasick automaton. Compared with [`Regex`], a dictionary //! has no regex engine, no validator, and a single shared confidence //! score applied to every match. //! -//! Construct via: +//! Construct via [`Dictionary::builder`] for the chainable style or +//! [`Dictionary::from_toml`] for a self-contained TOML source. //! -//! - [`Dictionary::builder`] — chainable, ground-up -//! - [`Dictionary::from_toml`] — self-contained TOML +//! Term sources are first-class — see [`Terms`] for [`from_text`] +//! and [`from_csv`] constructors. The builder's [`with_terms`] +//! setter accepts anything convertible to [`Terms`]. //! -//! Term sources are first-class — see [`Terms`] for -//! [`from_text`] and -//! [`from_csv`] constructors. The builder's -//! [`with_terms`] setter accepts -//! anything convertible to [`Terms`]. -//! -//! [`Pattern`]: crate::Pattern +//! [`Regex`]: crate::Regex //! [`Terms`]: crate::Terms //! [`from_text`]: crate::Terms::from_text //! [`from_csv`]: crate::Terms::from_csv //! [`with_terms`]: DictionaryBuilder::with_terms use derive_builder::Builder; -use nvisy_context::Context; use nvisy_core::Error; use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use super::terms::Terms; +/// Confidence policy for a [`Dictionary`]'s matches. 
+/// +/// Either every term gets the same score ([`Uniform`]), or scores +/// are picked per CSV source column ([`PerColumn`]). The untagged +/// serde representation accepts a bare number for the uniform +/// case and an array for the per-column case: +/// +/// ```toml +/// score = 0.9 # Uniform +/// score = [0.85, 0.30] # PerColumn +/// ``` +/// +/// [`Uniform`]: Scoring::Uniform +/// [`PerColumn`]: Scoring::PerColumn +#[derive(Debug, Clone, PartialEq, Deserialize)] +#[serde(untagged)] +pub enum Scoring { + /// Single confidence stamped on every match. The common case. + Uniform(Confidence), + /// Per-column confidence vector. `[i]` is the confidence + /// stamped on every term whose source CSV column was `i`. A + /// term from a column past the end of this vec is a + /// recognizer-build error — define one score per column. + PerColumn(Vec), +} + +impl Scoring { + /// Validate the policy's internal shape. A + /// `PerColumn(vec![])` can never resolve a score for any + /// column, so callers (the recognizer at build time) surface + /// it as a configuration error. + /// + /// # Errors + /// + /// Returns the human-readable reason the policy is invalid. + pub fn validate(&self) -> Result<(), &'static str> { + match self { + Self::Uniform(_) => Ok(()), + Self::PerColumn(scores) if scores.is_empty() => { + Err("PerColumn scoring with no scores can never resolve") + } + Self::PerColumn(_) => Ok(()), + } + } + + /// Resolve a score for `column`. `Uniform` ignores the column + /// and always returns its score; `PerColumn` returns the entry + /// at `column`, or `None` when no column is supplied or the + /// index is past the end of the per-column vector. Callers + /// decide the fall-back policy (per-term override, hard + /// error, default constant, etc.). 
+ #[must_use] + pub fn get(&self, column: Option) -> Option { + match self { + Self::Uniform(s) => Some(*s), + Self::PerColumn(scores) => column.and_then(|c| scores.get(c as usize).copied()), + } + } +} + +impl Default for Scoring { + fn default() -> Self { + Self::Uniform(Confidence::MAX) + } +} + /// Literal-term detection rule. -#[derive(Debug, Clone, PartialEq, Builder)] -#[derive(Serialize, Deserialize, JsonSchema)] +/// +/// ``` +/// use nvisy_core::entity::builtins; +/// use nvisy_pattern::{Dictionary, Terms}; +/// +/// let dictionary = Dictionary::builder() +/// .with_name("nationalities") +/// .with_label(builtins::NATIONALITY.label_ref()) +/// .with_terms(Terms::from(["German", "French", "Italian"])) +/// .build() +/// .expect("nationalities dictionary builds"); +/// ``` +#[derive(Debug, Clone, PartialEq, Builder, Deserialize)] #[builder( name = "DictionaryBuilder", pattern = "owned", @@ -46,49 +117,39 @@ pub struct Dictionary { pub name: String, /// Entity label every match emits. pub label: EntityLabelRef, - /// Literal terms to scan for. The recognizer compiles these into - /// an Aho-Corasick automaton at build time. + /// Literal terms to scan for. The recognizer compiles these + /// into an Aho-Corasick automaton at build time. pub terms: Terms, - /// Confidence score stamped on every match before any boost. - #[builder(default = "Confidence::MAX")] - pub score: Confidence, - /// Optional context keywords carried through to emitted entities - /// for a downstream enhancer to apply boosts. + /// Confidence policy: uniform across every term, or per CSV + /// source column. Defaults to [`Scoring::Uniform`] with + /// [`Confidence::MAX`]. #[builder(default)] - #[serde(default, skip_serializing_if = "context_is_default")] - pub context: Context, + #[serde(default, rename = "score")] + pub scoring: Scoring, + /// Context keywords that lift confidence when one of them + /// appears near a match. 
Harvested by the engine into a + /// per-label `BoostRule` in `nvisy-context`; the recognizer + /// itself never reads this field. + #[builder(default)] + #[serde(default)] + pub context: Vec, /// Languages the dictionary applies to (BCP-47 tags). An empty - /// list (the default) means the dictionary applies regardless of - /// language; otherwise the recognizer skips this dictionary when - /// the per-call language hint is set to a tag not in this list. + /// list (the default) means the dictionary applies regardless + /// of language; otherwise the recognizer skips this dictionary + /// when the per-call language hint is set to a tag not in this + /// list. #[builder(default)] - #[serde(default, skip_serializing_if = "Vec::is_empty")] - #[schemars(with = "Vec")] + #[serde(default)] pub languages: Vec, /// Require word-boundary surroundings on every match. With the - /// default of `true`, a term `"am"` matches the word `"am"` but - /// not the `"am"` inside `"example"`. Word characters are + /// default of `true`, a term `"am"` matches the word `"am"` + /// but not the `"am"` inside `"example"`. Word characters are /// alphanumerics and `_` (Unicode-aware). Set to `false` for /// dictionaries that genuinely want substring matching (e.g. /// scanning for embedded credentials inside arbitrary tokens). #[builder(default = "true")] #[serde(default = "default_word_boundary")] pub word_boundary: bool, - /// Per-column confidence overrides for terms loaded from a - /// multi-column CSV. `column_scores[i]` is the confidence - /// stamped on every term whose source column was `i`; terms - /// from a column past the end of this vec fall back to the - /// dictionary's default `score`. Useful when one column - /// carries unambiguous long-form names (`English`, `Spanish`) - /// and another carries short codes (`en`, `es`) that collide - /// with common words. 
- /// - /// Empty (the default) means "use `score` for every match", - /// preserving the historical behaviour of single-confidence - /// dictionaries. - #[builder(default)] - #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub column_scores: Vec, } fn default_word_boundary() -> bool { @@ -143,8 +204,8 @@ impl Dictionary { let mut builder = Dictionary::builder() .with_name(metadata.name) .with_label(metadata.label); - if let Some(score) = metadata.score { - builder = builder.with_score(score); + if let Some(scoring) = metadata.score { + builder = builder.with_scoring(scoring); } if let Some(context) = metadata.context { builder = builder.with_context(context); @@ -152,29 +213,20 @@ impl Dictionary { if let Some(wb) = metadata.word_boundary { builder = builder.with_word_boundary(wb); } - if let Some(cs) = metadata.column_scores { - builder = builder.with_column_scores(cs); - } Ok(builder) } } /// Wire shape for the dictionary metadata sidecar TOML — every /// field [`Dictionary`] carries except `terms`. -#[derive(Debug, Clone, Serialize, Deserialize)] +#[derive(Debug, Clone, Deserialize)] struct DictionaryMetadata { name: String, label: EntityLabelRef, #[serde(default)] - score: Option, + score: Option, #[serde(default)] - context: Option, + context: Option>, #[serde(default)] word_boundary: Option, - #[serde(default)] - column_scores: Option>, -} - -fn context_is_default(ctx: &Context) -> bool { - ctx.is_empty() && ctx.window.is_none() && ctx.boost.is_none() } diff --git a/crates/nvisy-pattern/src/recognition/mod.rs b/crates/nvisy-pattern/src/recognition/mod.rs index 0ce29c61..d6d2d18e 100644 --- a/crates/nvisy-pattern/src/recognition/mod.rs +++ b/crates/nvisy-pattern/src/recognition/mod.rs @@ -1,19 +1,18 @@ -//! Recognition primitives — the rule shapes ([`Regex`], -//! [`Dictionary`]), their building blocks ([`Terms`] plus -//! [`Context`] from `nvisy-context`), -//! the [`PatternRegistry`] that bundles them, and the runtime -//! 
[`PatternRecognizer`] that compiles them into pooled scanners. -//! -//! [`Context`]: nvisy_context::Context +//! Recognition primitives — the rule shapes ([`Regex`] + its +//! [`Variant`]s, [`Dictionary`]), their building blocks ([`Terms`]), +//! and the runtime [`PatternRecognizer`] that compiles them into +//! pooled scanners. Per-rule and per-dictionary `context` keyword +//! lists are harvested by the recognizer at build time into a +//! wrapping `Boosting` layer that applies post-recognition keyword +//! boosts. +mod compiled; mod dictionary; mod recognizer; -mod regex_rule; -mod registry; +mod regex; mod terms; -pub use self::dictionary::{Dictionary, DictionaryBuilder}; +pub use self::dictionary::{Dictionary, DictionaryBuilder, Scoring}; pub use self::recognizer::{PatternRecognizer, PatternRecognizerBuilder}; -pub use self::regex_rule::{Regex, RegexBuilder}; -pub use self::registry::PatternRegistry; -pub use self::terms::Terms; +pub use self::regex::{Regex, RegexBuilder, Variant, VariantBuilder}; +pub use self::terms::{Term, Terms}; diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index ce987ee3..01bc5533 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -1,5 +1,5 @@ -//! [`PatternRecognizer`]: compiles a [`PatternRegistry`] into pooled -//! scanners and implements [`EntityRecognizer`]. +//! [`PatternRecognizer`]: compiles patterns and dictionaries into +//! pooled scanners and implements [`EntityRecognizer`]. //! //! The internal split is intentional: regex patterns go into a //! single [`regex::RegexSet`] for a one-pass scan across every @@ -7,64 +7,41 @@ //! [`aho_corasick::AhoCorasick`] automaton for a one-pass scan //! across every literal. Both passes share one walk over the input //! and emit entities in modality-local byte coordinates. - -use std::sync::Arc; +//! +//! 
Construction is builder-driven: [`PatternRecognizer::builder`] +//! returns a [`PatternRecognizerBuilder`] that accumulates patterns, +//! dictionaries, and (optionally) a custom validator registry, then +//! compiles everything into the scanners on [`build`]. The shipped +//! built-in pattern + dictionary set is [`PatternRecognizerBuilder::builtin`]. +//! +//! [`build`]: PatternRecognizerBuilder::build use aho_corasick::{AhoCorasick, MatchKind}; -use nvisy_core::entity::{Entity, EntityLabelRef, PatternProvenance, TrailProvenance, TrailStep}; -use nvisy_core::modality::{Text, TextLocation}; -use nvisy_core::primitive::{Confidence, LanguageTag}; +use nvisy_context::{BoostRule, Boosting, Enhancer, SubstringMatcher}; +use nvisy_core::entity::{Entity, EntityLabelCatalog, EntityLabelRef}; +use nvisy_core::modality::Text; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput}; use nvisy_core::{Error, Result}; -use regex::{Regex, RegexSet}; +use regex::RegexSet; -use super::registry::PatternRegistry; -use crate::validators::{Validator, ValidatorRegistry}; +use super::compiled::{CompiledDictionary, CompiledPattern, has_word_boundaries}; +use super::dictionary::Dictionary; +use super::regex::Regex; +use crate::shipped; +use crate::validators::ValidatorRegistry; -/// Source of truth for one runtime pattern: the regex compiled -/// once, plus the metadata needed to emit entities. +/// Runtime text recognizer composed of one regex pool and one +/// Aho-Corasick automaton. /// -/// `context` is intentionally not stored on the compiled state — -/// the recognizer never reads it; the [`ContextEnhancer`] looks it -/// up directly on the [`PatternRegistry`] at boost time. 
+/// ``` +/// use nvisy_pattern::PatternRecognizer; /// -/// [`ContextEnhancer`]: nvisy_context::ContextEnhancer -struct CompiledPattern { - name: String, - label: EntityLabelRef, - regex: Regex, - raw_regex: String, - score: Confidence, - validator: Option>, - /// Languages this pattern applies to. Empty means "any language". - languages: Vec, -} - -/// Source of truth for one runtime dictionary: its term range -/// inside the shared Aho-Corasick automaton, plus per-dictionary -/// emission metadata. -struct CompiledDictionary { - name: String, - label: EntityLabelRef, - /// First term-id (inclusive) for this dictionary inside the - /// shared automaton. - term_start: usize, - /// One past the last term-id for this dictionary inside the - /// shared automaton. - term_end: usize, - /// Per-term confidence, indexed by `term_id - term_start`. - /// Resolved at compile time from the dictionary's - /// `column_scores` override (when set) or its default `score`. - term_scores: Vec, - /// Languages this dictionary applies to. Empty means "any - /// language". - languages: Vec, - /// Reject matches whose immediate neighbours are word - /// characters (alphanumeric or `_`). Mirrors regex `\b`. - word_boundary: bool, -} - -/// Composes a [`PatternRegistry`] into a single text recognizer. +/// let recognizer = PatternRecognizer::builder() +/// .with_builtin_patterns() +/// .with_builtin_dictionaries() +/// .build() +/// .expect("built-in recognizer builds"); +/// ``` pub struct PatternRecognizer { patterns: Vec, regex_set: Option, @@ -73,28 +50,77 @@ pub struct PatternRecognizer { } impl PatternRecognizer { - /// Start assembling a recognizer. Required: a registry, supplied - /// via [`with_registry`]. + /// Start a builder. Required: at least one pattern or + /// dictionary; otherwise [`build`] succeeds with a recognizer + /// that always emits zero entities. 
/// - /// [`with_registry`]: PatternRecognizerBuilder::with_registry + /// [`build`]: PatternRecognizerBuilder::build #[must_use] pub fn builder() -> PatternRecognizerBuilder { PatternRecognizerBuilder::default() } + + fn dictionary_owning_term(&self, term_id: usize) -> Option<&CompiledDictionary> { + self.dictionaries + .iter() + .find(|d| term_id >= d.term_start && term_id < d.term_end) + } } -/// Builder for [`PatternRecognizer`]. -#[derive(Default)] +/// Accumulates patterns, dictionaries, and a validator registry, +/// then compiles them into a [`PatternRecognizer`] wrapped in a +/// [`Boosting`] layer. +#[derive(Debug, Clone, Default)] pub struct PatternRecognizerBuilder { - registry: Option, + patterns: Vec, + dictionaries: Vec, validators: Option, } impl PatternRecognizerBuilder { - /// Attach the pattern + dictionary registry to compile. + /// Construct an empty builder. #[must_use] - pub fn with_registry(mut self, registry: PatternRegistry) -> Self { - self.registry = Some(registry); + pub fn new() -> Self { + Self::default() + } + + /// Pre-seed with the shipped built-in pattern + dictionary set. + /// Shorthand for + /// `Self::new().with_builtin_patterns().with_builtin_dictionaries()`. + #[must_use] + pub fn builtin() -> Self { + Self::new() + .with_builtin_patterns() + .with_builtin_dictionaries() + } + + /// Register one pattern. Patterns accumulate in registration + /// order. + #[must_use] + pub fn with_pattern(mut self, pattern: Regex) -> Self { + self.patterns.push(pattern); + self + } + + /// Register one dictionary. Dictionaries accumulate in + /// registration order. + #[must_use] + pub fn with_dictionary(mut self, dictionary: Dictionary) -> Self { + self.dictionaries.push(dictionary); + self + } + + /// Register every shipped built-in pattern. + #[must_use] + pub fn with_builtin_patterns(mut self) -> Self { + self.patterns.extend(shipped::patterns::all()); + self + } + + /// Register every shipped built-in dictionary. 
+ #[must_use] + pub fn with_builtin_dictionaries(mut self) -> Self { + self.dictionaries.extend(shipped::dictionaries::all()); self } @@ -106,52 +132,109 @@ impl PatternRecognizerBuilder { self } + /// Drop every pattern and dictionary whose `label` is not + /// registered in `catalog`. Used to build a per-request + /// recognizer from a workspace-wide template — rules that + /// would emit labels no policy declared never run. + #[must_use] + pub fn filter_by_catalog(mut self, catalog: &EntityLabelCatalog) -> Self { + self.patterns + .retain(|p| catalog.lookup(p.label.as_str()).is_some()); + self.dictionaries + .retain(|d| catalog.lookup(d.label.as_str()).is_some()); + self + } + + /// `true` when the builder has no patterns and no + /// dictionaries. Engine code uses this to skip the + /// per-request recognizer entirely when the catalog filter + /// dropped every rule. + #[must_use] + pub fn is_empty(&self) -> bool { + self.patterns.is_empty() && self.dictionaries.is_empty() + } + + /// Borrow the accumulated patterns. + #[must_use] + pub fn patterns(&self) -> &[Regex] { + &self.patterns + } + + /// Borrow the accumulated dictionaries. + #[must_use] + pub fn dictionaries(&self) -> &[Dictionary] { + &self.dictionaries + } + /// Compile every registered pattern and dictionary into the - /// pooled scanners. + /// pooled scanners and wrap the recognizer in a [`Boosting`] + /// layer carrying per-label keyword boosts harvested from the + /// same set of rules. /// /// # Errors /// - /// Returns an error when no registry was supplied, when a - /// pattern's regex fails to compile, when a pattern references - /// an unknown validator name, or when the shared automata - /// cannot be constructed. 
- pub fn build(self) -> Result { - let registry = self.registry.ok_or_else(|| { - Error::validation( - "PatternRecognizer requires a registry — call `with_registry` first", - "nvisy-pattern", - ) - })?; - let validators = self.validators.unwrap_or_else(ValidatorRegistry::builtin); - let mut compiled_patterns = Vec::with_capacity(registry.patterns().len()); - let mut regex_sources = Vec::with_capacity(registry.patterns().len()); - - for pattern in registry.patterns() { - let regex = Regex::new(&pattern.regex).map_err(|e| { - Error::validation( - format!("pattern `{}`: invalid regex: {e}", pattern.name), - "nvisy-pattern", - ) - })?; - let validator = match pattern.validator.as_deref() { - None => None, - Some(name) => Some(validators.resolve(name).ok_or_else(|| { + /// Returns a validation error when a pattern variant's regex + /// fails to compile, when a variant references an unknown + /// validator name, when a dictionary's `scoring` is invalid + /// or under-declared for some term's source column, or when + /// the shared automata cannot be constructed. + pub fn build(self) -> Result> { + let validators = self + .validators + .clone() + .unwrap_or_else(ValidatorRegistry::builtin); + let (compiled_patterns, regex_set) = self.compile_patterns(&validators)?; + let (compiled_dicts, aho) = self.compile_dictionaries()?; + let enhancer = self.build_enhancer(); + + let recognizer = PatternRecognizer { + patterns: compiled_patterns, + regex_set, + dictionaries: compiled_dicts, + aho, + }; + + Ok(Boosting::new(recognizer, enhancer)) + } + + /// Compile every `(pattern, variant)` pair into a + /// [`CompiledPattern`] keyed by its slot in the shared + /// [`RegexSet`]. 
+ fn compile_patterns( + &self, + validators: &ValidatorRegistry, + ) -> Result<(Vec, Option)> { + let variant_total: usize = self.patterns.iter().map(|p| p.variants.len()).sum(); + let mut compiled = Vec::with_capacity(variant_total); + let mut regex_sources = Vec::with_capacity(variant_total); + + for pattern in &self.patterns { + for variant in &pattern.variants { + let regex = ::regex::Regex::new(&variant.regex).map_err(|e| { Error::validation( - format!("pattern `{}`: unknown validator `{}`", pattern.name, name), + format!("pattern `{}`: invalid regex: {e}", pattern.name), "nvisy-pattern", ) - })?), - }; - regex_sources.push(pattern.regex.clone()); - compiled_patterns.push(CompiledPattern { - name: pattern.name.clone(), - label: pattern.label.clone(), - regex, - raw_regex: pattern.regex.clone(), - score: pattern.score, - validator, - languages: pattern.languages.clone(), - }); + })?; + let validator = match variant.validator.as_deref() { + None => None, + Some(name) => Some(validators.resolve(name).ok_or_else(|| { + Error::validation( + format!("pattern `{}`: unknown validator `{}`", pattern.name, name), + "nvisy-pattern", + ) + })?), + }; + regex_sources.push(variant.regex.clone()); + compiled.push(CompiledPattern { + pattern_name: pattern.name.clone(), + label: pattern.label.clone(), + regex, + score: variant.score, + validator, + languages: pattern.languages.clone(), + }); + } } let regex_set = if regex_sources.is_empty() { @@ -161,25 +244,55 @@ impl PatternRecognizerBuilder { Error::validation(format!("compiling regex set: {e}"), "nvisy-pattern") })?) }; + Ok((compiled, regex_set)) + } - let mut compiled_dicts = Vec::with_capacity(registry.dictionaries().len()); + /// Compile every dictionary into a [`CompiledDictionary`] + /// with its term-id range inside the shared Aho-Corasick + /// automaton, plus per-term confidences resolved from the + /// dictionary's `scoring` policy (with per-term overrides + /// taking precedence). 
+ fn compile_dictionaries(&self) -> Result<(Vec, Option)> { + let mut compiled = Vec::with_capacity(self.dictionaries.len()); let mut all_terms: Vec = Vec::new(); - for dict in registry.dictionaries() { + + for dict in &self.dictionaries { + if let Err(reason) = dict.scoring.validate() { + return Err(Error::validation( + format!("dictionary `{}`: {reason}", dict.name), + "nvisy-pattern", + )); + } let term_start = all_terms.len(); let mut term_scores = Vec::with_capacity(dict.terms.len()); for entry in dict.terms.entries() { all_terms.push(entry.term.clone()); - // Resolve column → score. Out-of-range columns fall - // back to the dictionary's default score. - let score = dict - .column_scores - .get(entry.column as usize) - .copied() - .unwrap_or(dict.score); + // Per-term `score` wins when set; otherwise ask + // the dictionary's `Scoring` to resolve against + // the term's source column. `None` means the + // column didn't map to a declared score — + // surfaced as a hard build error so silent + // misconfiguration can't happen. + let score = entry + .score + .or_else(|| dict.scoring.get(entry.column)) + .ok_or_else(|| { + let column_desc = entry + .column + .map_or_else(|| "no column".to_owned(), |c| format!("column {c}")); + Error::validation( + format!( + "dictionary `{}`: term `{}` ({column_desc}) has no score in \ + dictionary scoring", + dict.name, entry.term, + ), + "nvisy-pattern", + ) + })?; term_scores.push(score); } let term_end = all_terms.len(); - compiled_dicts.push(CompiledDictionary { + compiled.push(CompiledDictionary { name: dict.name.clone(), label: dict.label.clone(), term_start, @@ -196,12 +309,12 @@ impl PatternRecognizerBuilder { Some( AhoCorasick::builder() .ascii_case_insensitive(false) - // Longest-match-at-position: when both `en` and - // `English` start at the same offset, return - // `English`. 
Without this, the short ISO code - // would win and word-boundary post-filtering - // would then reject it, dropping the legitimate - // long-form match. + // Longest-match-at-position: when both `en` + // and `English` start at the same offset, + // return `English`. Without this, the short + // ISO code would win and word-boundary + // post-filtering would then reject it, + // dropping the legitimate long-form match. .match_kind(MatchKind::LeftmostLongest) .build(&all_terms) .map_err(|e| { @@ -212,13 +325,33 @@ impl PatternRecognizerBuilder { })?, ) }; + Ok((compiled, aho)) + } - Ok(PatternRecognizer { - patterns: compiled_patterns, - regex_set, - dictionaries: compiled_dicts, - aho, - }) + /// Build the wrapping [`Enhancer`] from per-pattern and + /// per-dictionary context keywords. + fn build_enhancer(&self) -> Enhancer { + let boost_rules: Vec = self + .context_keywords() + .map(|(label, keywords)| BoostRule::for_label(label.clone(), keywords.iter().cloned())) + .collect(); + Enhancer::new(boost_rules, Box::new(SubstringMatcher)) + } + + /// Yield `(label, keywords)` for every pattern and dictionary + /// that declares a non-empty context. 
+ fn context_keywords(&self) -> impl Iterator { + let pattern_keywords = self + .patterns + .iter() + .filter(|p| !p.context.is_empty()) + .map(|p| (&p.label, p.context.as_slice())); + let dict_keywords = self + .dictionaries + .iter() + .filter(|d| !d.context.is_empty()) + .map(|d| (&d.label, d.context.as_slice())); + pattern_keywords.chain(dict_keywords) } } @@ -226,7 +359,7 @@ impl PatternRecognizerBuilder { impl EntityRecognizer for PatternRecognizer { async fn recognize(&self, input: &RecognizerInput) -> Result> { let text = input.data.text.as_str(); - let mut entities = Vec::new(); + let mut entities: Vec> = Vec::new(); if let Some(set) = self.regex_set.as_ref() { for pattern_id in set.matches(text).into_iter() { @@ -240,7 +373,7 @@ impl EntityRecognizer for PatternRecognizer { { continue; } - entities.push(build_pattern_entity(pat, m.start(), m.end())); + entities.push(pat.build_entity(m.start(), m.end())); } } } @@ -258,7 +391,7 @@ impl EntityRecognizer for PatternRecognizer { continue; } let score = dict.term_scores[term_id - dict.term_start]; - entities.push(build_dictionary_entity(dict, score, mat.start(), mat.end())); + entities.push(dict.build_entity(score, mat.start(), mat.end())); } } @@ -266,88 +399,14 @@ impl EntityRecognizer for PatternRecognizer { } } -impl PatternRecognizer { - fn dictionary_owning_term(&self, term_id: usize) -> Option<&CompiledDictionary> { - self.dictionaries - .iter() - .find(|d| term_id >= d.term_start && term_id < d.term_end) - } -} - -fn build_pattern_entity(pat: &CompiledPattern, start: usize, end: usize) -> Entity { - let provenance = TrailProvenance::Pattern(PatternProvenance::Regex { - name: pat.name.clone(), - regex: Some(pat.raw_regex.clone()), - validator: pat.validator.as_ref().map(|_| pat.name.clone()), - contextual: false, - }); - let step = TrailStep::recognition( - "pattern", - pat.score, - provenance, - format!("pattern `{}` matched", pat.name), - ); - Entity::builder() - .with_label(pat.label.clone()) - 
.with_trail(vec![step]) - .with_confidence(pat.score) - .with_location(TextLocation::new(start, end)) - .build() - .expect("required fields provided") -} - -/// Mirror of regex `\b` for the byte range `text[start..end]`: -/// the immediate neighbour characters (or start/end of input) must -/// not be word characters. A word character here is Unicode -/// alphanumeric or `_`, matching the conventional regex definition. -/// -/// Operates on `char` boundaries, not raw bytes, so multibyte -/// codepoints don't trigger false rejections (`é` is one char, not -/// two). -fn has_word_boundaries(text: &str, start: usize, end: usize) -> bool { - let left_is_word = text[..start].chars().next_back().is_some_and(is_word_char); - let right_is_word = text[end..].chars().next().is_some_and(is_word_char); - !left_is_word && !right_is_word -} - -fn is_word_char(c: char) -> bool { - c.is_alphanumeric() || c == '_' -} - -fn build_dictionary_entity( - dict: &CompiledDictionary, - score: Confidence, - start: usize, - end: usize, -) -> Entity { - let provenance = TrailProvenance::Pattern(PatternProvenance::Dictionary { - name: dict.name.clone(), - contextual: false, - }); - let step = TrailStep::recognition( - "pattern", - score, - provenance, - format!("dictionary `{}` matched", dict.name), - ); - Entity::builder() - .with_label(dict.label.clone()) - .with_trail(vec![step]) - .with_confidence(score) - .with_location(TextLocation::new(start, end)) - .build() - .expect("required fields provided") -} - #[cfg(test)] mod tests { - use nvisy_core::entity::builtins; - use nvisy_core::modality::TextData; + use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; + use nvisy_core::modality::{Text, TextData}; use nvisy_core::recognition::RecognizerInput; use super::*; use crate::Dictionary; - use crate::recognition::registry::PatternRegistry; use crate::recognition::terms::Terms; fn dict(name: &str, terms: &[&str], word_boundary: bool) -> Dictionary { @@ -360,7 +419,7 @@ mod tests { 
.expect("dictionary builds") } - async fn run(recognizer: &PatternRecognizer, text: &str) -> Vec> { + async fn run(recognizer: &impl EntityRecognizer, text: &str) -> Vec> { let input = RecognizerInput::new(TextData::new(text.to_owned())); recognizer .recognize(&input) @@ -371,9 +430,8 @@ mod tests { #[tokio::test] async fn word_boundary_rejects_substring_matches() { - let registry = PatternRegistry::new().with_dictionary(dict("langs", &["am", "or"], true)); let recognizer = PatternRecognizer::builder() - .with_registry(registry) + .with_dictionary(dict("langs", &["am", "or"], true)) .build() .expect("recognizer builds"); @@ -391,29 +449,12 @@ mod tests { #[tokio::test] async fn word_boundary_disabled_keeps_substring_matches() { - let registry = PatternRegistry::new().with_dictionary(dict("langs", &["am"], false)); let recognizer = PatternRecognizer::builder() - .with_registry(registry) + .with_dictionary(dict("langs", &["am"], false)) .build() .expect("recognizer builds"); let entities = run(&recognizer, "example").await; assert_eq!(entities.len(), 1, "substring match must be kept"); } - - #[test] - fn has_word_boundaries_handles_edges_and_unicode() { - // Match touches both edges of the input → boundaries OK. - assert!(has_word_boundaries("hello", 0, 5)); - // Match preceded by a word char → not a boundary. - assert!(!has_word_boundaries("example", 5, 7)); - // Match followed by a word char → not a boundary. - assert!(!has_word_boundaries("amount", 0, 2)); - // Space surround → boundaries OK. - assert!(has_word_boundaries(" am ", 1, 3)); - // Unicode word char on the left → not a boundary. - assert!(!has_word_boundaries("café_am", 5, 7)); - // Punctuation around → boundaries OK. 
- assert!(has_word_boundaries("(am)", 1, 3)); - } } diff --git a/crates/nvisy-pattern/src/recognition/regex.rs b/crates/nvisy-pattern/src/recognition/regex.rs new file mode 100644 index 00000000..0d762fcc --- /dev/null +++ b/crates/nvisy-pattern/src/recognition/regex.rs @@ -0,0 +1,149 @@ +//! [`Regex`]: per-label regex-based detection rule. +//! +//! A `Regex` rule bundles one entity label, its context-keyword +//! list, and one or more [`Variant`]s. Each variant carries its +//! own regex source, emission score, and optional named +//! validator. All variants under one rule emit the same label. +//! +//! Construct via [`Regex::builder`] for the chainable style or +//! [`Regex::from_toml`] when loading a definition file. + +use derive_builder::Builder; +use nvisy_core::Error; +use nvisy_core::entity::EntityLabelRef; +use nvisy_core::primitive::{Confidence, LanguageTag}; +use serde::Deserialize; + +/// One regex variant inside a [`Regex`] rule. Carries the regex +/// source, the emission confidence stamped on every match, and the +/// optional validator name resolved at recognizer-build time. +#[derive(Debug, Clone, PartialEq, Builder, Deserialize)] +#[builder( + name = "VariantBuilder", + pattern = "owned", + setter(into, strip_option, prefix = "with"), + build_fn(error = "Error", validate = "VariantBuilder::validate") +)] +pub struct Variant { + /// Regex source. Compiled to a [`::regex::Regex`] by + /// [`PatternRecognizer::build`]; shape errors there, not here. + /// + /// [`PatternRecognizer::build`]: super::PatternRecognizer + pub regex: String, + /// Confidence score stamped on every match this variant emits + /// before any post-recognition boost. + #[builder(default = "Confidence::MAX")] + pub score: Confidence, + /// Optional validator name. Resolved at recognizer-build time + /// against the [`ValidatorRegistry`]; matches that fail + /// validation are dropped. 
+ /// + /// [`ValidatorRegistry`]: crate::validators::ValidatorRegistry + #[builder(default)] + #[serde(default)] + pub validator: Option, +} + +impl Variant { + /// Start a chainable builder. Required field: `regex`. + #[must_use] + pub fn builder() -> VariantBuilder { + VariantBuilder::default() + } +} + +impl VariantBuilder { + fn validate(&self) -> Result<(), Error> { + if let Some(regex) = self.regex.as_ref() + && let Err(e) = ::regex::Regex::new(regex) + { + return Err(Error::validation( + format!("invalid regex: {e}"), + "nvisy-pattern", + )); + } + Ok(()) + } +} + +/// Regex-based detection rule: one label, optional boost +/// keywords, one or more [`Variant`]s. Matches the Presidio +/// "pattern recognizer" shape — multiple regex strategies for one +/// entity type, plus a shared context keyword list. +/// +/// ``` +/// use nvisy_core::entity::builtins; +/// use nvisy_core::primitive::Confidence; +/// use nvisy_pattern::{Regex, Variant}; +/// +/// let variant = Variant::builder() +/// .with_regex(r"\b\d{3}-\d{2}-\d{4}\b") +/// .with_score(Confidence::clamped(0.9)) +/// .with_validator("ssn") +/// .build() +/// .expect("ssn variant builds"); +/// +/// let ssn = Regex::builder() +/// .with_name("ssn") +/// .with_label(builtins::GOVERNMENT_ID.label_ref()) +/// .with_context(vec!["ssn".to_owned(), "social security".to_owned()]) +/// .with_variants(vec![variant]) +/// .build() +/// .expect("ssn rule builds"); +/// ``` +#[derive(Debug, Clone, PartialEq, Builder, Deserialize)] +#[builder( + name = "RegexBuilder", + pattern = "owned", + setter(into, strip_option, prefix = "with"), + build_fn(error = "Error") +)] +pub struct Regex { + /// Human-readable identifier (e.g. `"ssn"`, `"credit_card"`). + /// Surfaced in trail steps so downstream consumers can see + /// which rule matched. + pub name: String, + /// Entity label every variant emits. + pub label: EntityLabelRef, + /// Context keywords that lift confidence when one of them + /// appears near a match. 
Harvested by [`PatternRecognizer`] + /// into a per-label boost rule; rules themselves never read + /// this field. + /// + /// [`PatternRecognizer`]: super::PatternRecognizer + #[builder(default)] + #[serde(default)] + pub context: Vec, + /// Regex variants. At least one is required for the rule to + /// produce any matches; the recognizer skips rules with no + /// variants. + pub variants: Vec, + /// Languages this rule applies to (BCP-47 tags). An empty + /// list (the default) means the rule applies regardless of + /// language; otherwise the recognizer skips this rule when + /// the per-call language hint is set to a tag not in this + /// list. + #[builder(default)] + #[serde(default)] + pub languages: Vec, +} + +impl Regex { + /// Start a chainable builder. Required fields: `name`, + /// `label`, `variants`. + #[must_use] + pub fn builder() -> RegexBuilder { + RegexBuilder::default() + } + + /// Parse a regex rule from a TOML string. + /// + /// # Errors + /// + /// Returns a validation error when the TOML is malformed or + /// missing required fields. + pub fn from_toml(raw: &str) -> Result { + toml::from_str(raw) + .map_err(|e| Error::validation(format!("regex rule TOML: {e}"), "nvisy-pattern")) + } +} diff --git a/crates/nvisy-pattern/src/recognition/regex_rule.rs b/crates/nvisy-pattern/src/recognition/regex_rule.rs deleted file mode 100644 index 55f303ca..00000000 --- a/crates/nvisy-pattern/src/recognition/regex_rule.rs +++ /dev/null @@ -1,107 +0,0 @@ -//! [`Regex`]: regex-backed detection rule. -//! -//! A regex rule bundles a regular expression with the entity kind -//! it detects, an emission confidence score, optional context -//! keywords that downstream enhancers can boost on, and an optional -//! named validator (Luhn, IBAN, …) the recognizer runs over each -//! match before emitting an entity. -//! -//! Construct via [`Regex::builder`] for the chainable style or -//! [`Regex::from_toml`] when loading a definition file. 
- -use derive_builder::Builder; -use nvisy_context::Context; -use nvisy_core::Error; -use nvisy_core::entity::EntityLabelRef; -use nvisy_core::primitive::{Confidence, LanguageTag}; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; - -/// Regex-backed detection rule. -/// -/// Identical fields whether built via [`RegexBuilder`] or loaded -/// from a TOML file via [`Regex::from_toml`]. -#[derive(Debug, Clone, PartialEq, Builder)] -#[derive(Serialize, Deserialize, JsonSchema)] -#[builder( - name = "RegexBuilder", - pattern = "owned", - setter(into, strip_option, prefix = "with"), - build_fn(error = "Error", validate = "RegexBuilder::validate") -)] -pub struct Regex { - /// Human-readable identifier (e.g. `"ssn"`, `"credit_card"`). - /// Surfaced in trail steps so downstream consumers can see - /// which rule matched. - pub name: String, - /// Entity label every match emits. - pub label: EntityLabelRef, - /// Regex source. Compiled to a [`regex::Regex`] by - /// [`PatternRecognizer::build`]; shape - /// errors there, not here. - /// - /// [`PatternRecognizer::build`]: super::PatternRecognizer - pub regex: String, - /// Confidence score stamped on every match before any boost. - #[builder(default = "Confidence::MAX")] - pub score: Confidence, - /// Optional context keywords. Carried through to emitted - /// entities so a downstream enhancer can apply boosts. - #[builder(default)] - #[serde(default, skip_serializing_if = "context_is_default")] - pub context: Context, - /// Optional validator name. Resolved at recognizer build time - /// against the [`ValidatorRegistry`]. - /// Matches that fail validation are dropped. - /// - /// [`ValidatorRegistry`]: crate::validators::ValidatorRegistry - #[builder(default)] - #[serde(default, skip_serializing_if = "Option::is_none")] - pub validator: Option, - /// Languages the rule applies to (BCP-47 tags). 
An empty list - /// (the default) means the rule applies regardless of language; - /// otherwise the recognizer skips this rule when the per-call - /// language hint is set to a tag not in this list. - #[builder(default)] - #[serde(default, skip_serializing_if = "Vec::is_empty")] - #[schemars(with = "Vec")] - pub languages: Vec, -} - -impl Regex { - /// Start a chainable builder. Required fields: `name`, - /// `label`, `regex`. - #[must_use] - pub fn builder() -> RegexBuilder { - RegexBuilder::default() - } - - /// Parse a regex rule from a TOML string. - /// - /// # Errors - /// - /// Returns a validation error when the TOML is malformed or - /// missing required fields. - pub fn from_toml(raw: &str) -> Result { - toml::from_str(raw) - .map_err(|e| Error::validation(format!("regex TOML: {e}"), "nvisy-pattern")) - } -} - -impl RegexBuilder { - fn validate(&self) -> Result<(), Error> { - if let Some(regex) = self.regex.as_ref() - && let Err(e) = ::regex::Regex::new(regex) - { - return Err(Error::validation( - format!("invalid regex: {e}"), - "nvisy-pattern", - )); - } - Ok(()) - } -} - -fn context_is_default(ctx: &Context) -> bool { - ctx.is_empty() && ctx.window.is_none() && ctx.boost.is_none() -} diff --git a/crates/nvisy-pattern/src/recognition/registry.rs b/crates/nvisy-pattern/src/recognition/registry.rs deleted file mode 100644 index c763661a..00000000 --- a/crates/nvisy-pattern/src/recognition/registry.rs +++ /dev/null @@ -1,167 +0,0 @@ -//! [`PatternRegistry`]: a curated bundle of [`Regex`]es and -//! [`Dictionary`]s that downstream consumers borrow. -//! -//! Both [`PatternRecognizer`] and the shared [`ContextEnhancer`] -//! consume a registry — the recognizer compiles its rules into -//! pooled scanners; the enhancer reads per-rule context keywords -//! via [`PatternRegistry::context_registry`]. -//! -//! Centralising the rule set here means no duplication of -//! [`Regex`] / [`Dictionary`] storage between the two consumers. -//! -//! 
[`PatternRecognizer`]: super::PatternRecognizer -//! [`ContextEnhancer`]: nvisy_context::ContextEnhancer - -use nvisy_context::ContextRegistry; -use nvisy_core::entity::EntityLabelCatalog; - -use super::dictionary::Dictionary; -use super::regex_rule::Regex; -use crate::shipped; - -/// Bundle of regexes and dictionaries shared by every downstream -/// consumer. -/// -/// Cheap to clone (`Vec` of small structs). Construct via -/// [`PatternRegistry::new`] for an empty registry, -/// [`PatternRegistry::builtin`] for the shipped registry (every -/// built-in regex + dictionary), or chain [`with_pattern`] / -/// [`with_dictionary`] / [`with_builtin_patterns`] / -/// [`with_builtin_dictionaries`] to mix custom rules in. -/// -/// [`with_pattern`]: PatternRegistry::with_pattern -/// [`with_dictionary`]: PatternRegistry::with_dictionary -/// [`with_builtin_patterns`]: PatternRegistry::with_builtin_patterns -/// [`with_builtin_dictionaries`]: PatternRegistry::with_builtin_dictionaries -#[derive(Debug, Clone, Default)] -pub struct PatternRegistry { - regexes: Vec, - dictionaries: Vec, -} - -impl PatternRegistry { - /// Construct an empty registry. - #[must_use] - pub fn new() -> Self { - Self::default() - } - - /// Construct the shipped registry: every built-in regex pattern - /// and every built-in dictionary, in registration order. - /// Shorthand for `PatternRegistry::new().with_builtin_patterns().with_builtin_dictionaries()`. - #[must_use] - pub fn builtin() -> Self { - Self::new() - .with_builtin_patterns() - .with_builtin_dictionaries() - } - - /// Register one regex. Call once per regex; the registry - /// accumulates them in registration order. - #[must_use] - pub fn with_pattern(mut self, regex: Regex) -> Self { - self.regexes.push(regex); - self - } - - /// Register one dictionary. Call once per dictionary; the - /// registry accumulates them in registration order. 
- #[must_use] - pub fn with_dictionary(mut self, dictionary: Dictionary) -> Self { - self.dictionaries.push(dictionary); - self - } - - /// Register every shipped built-in regex pattern in registration - /// order. Replaces the common `for p in patterns::all() { reg = - /// reg.with_pattern(p); }` boilerplate. - #[must_use] - pub fn with_builtin_patterns(mut self) -> Self { - self.regexes.extend(shipped::patterns::all()); - self - } - - /// Register every shipped built-in dictionary in registration - /// order. Replaces the common `dictionaries::all().into_iter() - /// .fold(reg, PatternRegistry::with_dictionary)` boilerplate. - #[must_use] - pub fn with_builtin_dictionaries(mut self) -> Self { - self.dictionaries.extend(shipped::dictionaries::all()); - self - } - - /// Borrow the registered regexes. - #[must_use] - pub fn patterns(&self) -> &[Regex] { - &self.regexes - } - - /// Borrow the registered dictionaries. - #[must_use] - pub fn dictionaries(&self) -> &[Dictionary] { - &self.dictionaries - } - - /// Drop every regex and dictionary whose `label` is not - /// registered in `catalog`. Used to build a per-request - /// registry from the workspace template — patterns that would - /// emit labels no policy declared never run. - #[must_use] - pub fn filter_by_catalog(mut self, catalog: &EntityLabelCatalog) -> Self { - self.regexes - .retain(|r| catalog.lookup(r.label.as_str()).is_some()); - self.dictionaries - .retain(|d| catalog.lookup(d.label.as_str()).is_some()); - self - } - - /// `true` when the registry has no regexes and no dictionaries. - #[must_use] - pub fn is_empty(&self) -> bool { - self.regexes.is_empty() && self.dictionaries.is_empty() - } - - /// Build a [`ContextRegistry`] containing every per-rule - /// context keyword declaration in this registry. - /// - /// Each [`Regex`] and [`Dictionary`] that declares a non-empty - /// context contributes one entry, keyed on its rule name. - /// Rules without context declarations are skipped. 
- /// - /// Use this to wire the - /// [`ContextEnhancer`] - /// against the same source of truth the recognizer compiles - /// from — no duplication of keyword data between rule - /// registration and enhancer construction. - /// - /// [`ContextEnhancer`]: nvisy_context::ContextEnhancer - #[must_use] - pub fn context_registry(&self) -> ContextRegistry { - let mut registry = ContextRegistry::new(); - for r in &self.regexes { - registry = registry.with_entry(r.name.clone(), r.context.clone()); - } - for d in &self.dictionaries { - registry = registry.with_entry(d.name.clone(), d.context.clone()); - } - registry - } -} - -impl FromIterator for PatternRegistry { - fn from_iter>(iter: I) -> Self { - Self { - regexes: iter.into_iter().collect(), - dictionaries: Vec::new(), - } - } -} - -impl FromIterator for PatternRegistry { - fn from_iter>(iter: I) -> Self { - Self { - regexes: Vec::new(), - dictionaries: iter.into_iter().collect(), - } - } -} diff --git a/crates/nvisy-pattern/src/recognition/terms.rs b/crates/nvisy-pattern/src/recognition/terms.rs index 7c17ddd1..d59ec141 100644 --- a/crates/nvisy-pattern/src/recognition/terms.rs +++ b/crates/nvisy-pattern/src/recognition/terms.rs @@ -17,33 +17,86 @@ use std::io::Cursor; use nvisy_core::Error; -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; +use nvisy_core::primitive::Confidence; +use serde::Deserialize; -/// Literal term list. Each term carries the **column index** it -/// came from (CSV column number, 0-based; non-CSV sources always -/// use column `0`). The column index is the join key for -/// [`Dictionary::column_scores`] per-column overrides. +/// Literal term list. Each [`Term`] carries an optional source +/// column (set by [`Terms::from_csv`]) plus an optional per-term +/// score override. The column index is the join key for +/// [`Dictionary::scoring`] when it's [`Scoring::PerColumn`]. /// -/// JSON-transparent: serialises to / deserialises from a JSON array -/// of `[term, column]` pairs. 
-/// -/// [`Dictionary::column_scores`]: crate::Dictionary::column_scores -#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize, JsonSchema)] +/// [`Dictionary::scoring`]: crate::Dictionary::scoring +/// [`Scoring::PerColumn`]: crate::Scoring::PerColumn +#[derive(Debug, Clone, PartialEq, Default, Deserialize)] #[serde(transparent)] -pub struct Terms(Vec); +pub struct Terms(Vec); -/// One entry in a [`Terms`] list: the literal plus the column it -/// was loaded from. Serde-renamed so the wire shape is the compact -/// tuple `[term, column]` rather than a verbose object. -#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] -pub struct TermEntry { +/// One entry in a [`Terms`] list: the literal, the column it was +/// loaded from (when applicable), and an optional explicit score +/// that overrides the dictionary's [`Scoring`] policy for this +/// term. +/// +/// Per-term score is `None` for the common path — the dictionary's +/// [`Scoring`] resolves the per-match score from the column. +/// Set `score` only for one-off exceptions (e.g. a term known to +/// be high-confidence even though its column is generally noisy). +/// +/// Per-term column is `None` for non-CSV sources (plain text +/// lists, the `From>` / array impls). `Some(i)` flags +/// a CSV cell from column `i`; the dictionary's +/// [`Scoring::PerColumn`] uses it to pick the per-column score. +/// +/// [`Scoring`]: crate::Scoring +/// [`Scoring::PerColumn`]: crate::Scoring::PerColumn +#[derive(Debug, Clone, PartialEq, Deserialize)] +pub struct Term { /// The literal scanned for. pub term: String, - /// CSV column the term came from (0-based). `0` for any - /// non-CSV source. + /// CSV column the term came from. `None` for non-CSV + /// sources; `Some(i)` for the cell at column `i` of a CSV. #[serde(default)] - pub column: u16, + pub column: Option, + /// Optional per-term score override. 
When `Some`, the + /// recognizer stamps this score on every match of this term; + /// when `None`, falls back to the dictionary's [`Scoring`] + /// policy resolved against [`column`]. + /// + /// [`Scoring`]: crate::Scoring + /// [`column`]: Self::column + #[serde(default)] + pub score: Option, +} + +impl Term { + /// Construct a term with no column and no per-term score + /// override. The common path for plain-text sources and + /// programmatic `From<…>` constructions. + #[must_use] + pub fn new(term: impl Into) -> Self { + Self { + term: term.into(), + column: None, + score: None, + } + } + + /// Attach a CSV source-column index, used by the dictionary's + /// [`Scoring::PerColumn`] to pick a per-column score. + /// + /// [`Scoring::PerColumn`]: crate::Scoring::PerColumn + #[must_use] + pub fn with_column(mut self, column: u16) -> Self { + self.column = Some(column); + self + } + + /// Set an explicit per-term score, overriding the dictionary's + /// column-resolved score for this term. + #[must_use] + pub fn with_score(mut self, score: Confidence) -> Self { + self.score = Some(score); + self + } } impl Terms { @@ -55,7 +108,7 @@ impl Terms { /// Borrow the inner entries. #[must_use] - pub fn entries(&self) -> &[TermEntry] { + pub fn entries(&self) -> &[Term] { &self.0 } @@ -73,44 +126,42 @@ impl Terms { /// Consume into the inner entries. #[must_use] - pub fn into_inner(self) -> Vec { + pub fn into_inner(self) -> Vec { self.0 } /// Parse terms from plain-text bytes — one term per line. - /// Each line is trimmed; empty lines and lines starting with `#` - /// are skipped. Every term gets column `0`. + /// Each line is trimmed; empty lines and lines starting with + /// `#` are skipped. Plain-text terms carry no column. /// /// # Errors /// - /// Returns a validation error when the input is not valid UTF-8. + /// Returns a validation error when the input is not valid + /// UTF-8. 
pub fn from_text(bytes: &[u8]) -> Result { let text = std::str::from_utf8(bytes) .map_err(|e| Error::validation(format!("terms text: {e}"), "nvisy-pattern"))?; - let entries: Vec = text + let entries: Vec = text .lines() .map(str::trim) .filter(|line| !line.is_empty() && !line.starts_with('#')) - .map(|line| TermEntry { - term: line.to_owned(), - column: 0, - }) + .map(Term::new) .collect(); Ok(Self(entries)) } - /// Parse terms from CSV bytes. Every non-empty cell across every - /// row becomes a term, and each term remembers the (0-based) - /// column index it came from so a [`Dictionary`] can apply - /// per-column confidence overrides via - /// [`Dictionary::column_scores`]. + /// Parse terms from CSV bytes. Every non-empty cell across + /// every row becomes a term, and each term remembers the + /// (0-based) column index it came from so a [`Dictionary`] + /// can apply per-column confidence overrides via + /// [`Scoring::PerColumn`]. /// /// # Errors /// /// Returns a validation error when the CSV is malformed. 
/// /// [`Dictionary`]: crate::Dictionary - /// [`Dictionary::column_scores`]: crate::Dictionary::column_scores + /// [`Scoring::PerColumn`]: crate::Scoring::PerColumn pub fn from_csv(bytes: &[u8]) -> Result { let mut reader = csv::ReaderBuilder::new() .has_headers(false) @@ -123,10 +174,8 @@ impl Terms { for (col_idx, cell) in row.iter().enumerate() { let trimmed = cell.trim(); if !trimmed.is_empty() { - entries.push(TermEntry { - term: trimmed.to_owned(), - column: u16::try_from(col_idx).unwrap_or(u16::MAX), - }); + let column = u16::try_from(col_idx).unwrap_or(u16::MAX); + entries.push(Term::new(trimmed).with_column(column)); } } } @@ -136,50 +185,24 @@ impl Terms { impl From> for Terms { fn from(terms: Vec) -> Self { - Self( - terms - .into_iter() - .map(|term| TermEntry { term, column: 0 }) - .collect(), - ) + Self(terms.into_iter().map(Term::new).collect()) } } impl From<&[&str]> for Terms { fn from(terms: &[&str]) -> Self { - Self( - terms - .iter() - .map(|s| TermEntry { - term: (*s).to_owned(), - column: 0, - }) - .collect(), - ) + Self(terms.iter().copied().map(Term::new).collect()) } } impl From<[&str; N]> for Terms { fn from(terms: [&str; N]) -> Self { - Self( - terms - .iter() - .map(|s| TermEntry { - term: (*s).to_owned(), - column: 0, - }) - .collect(), - ) + Self(terms.iter().copied().map(Term::new).collect()) } } impl From<[String; N]> for Terms { fn from(terms: [String; N]) -> Self { - Self( - terms - .into_iter() - .map(|term| TermEntry { term, column: 0 }) - .collect(), - ) + Self(terms.into_iter().map(Term::new).collect()) } } diff --git a/crates/nvisy-pattern/src/shipped/mod.rs b/crates/nvisy-pattern/src/shipped/mod.rs index faec8ff8..062acea8 100644 --- a/crates/nvisy-pattern/src/shipped/mod.rs +++ b/crates/nvisy-pattern/src/shipped/mod.rs @@ -2,10 +2,10 @@ //! crate. //! //! Each accessor parses an asset file embedded via -//! [`include_bytes!`] and returns a fresh [`Regex`] or -//! [`Dictionary`]. 
Metadata for dictionaries (entity kind, score, -//! context) is split into a JSON sidecar paired with a CSV / TXT -//! term source; regex rules are self-contained JSON. +//! [`include_str!`] and returns a fresh [`Regex`] or +//! [`Dictionary`]. Metadata for dictionaries (entity label, score, +//! context) is split into a TOML sidecar paired with a CSV / TXT +//! term source; regex rules are self-contained TOML. //! //! Use [`patterns::all`] and [`dictionaries::all`] to load the //! complete shipped set, or pick individual accessors. diff --git a/crates/nvisy-pattern/src/shipped/patterns.rs b/crates/nvisy-pattern/src/shipped/patterns.rs index 09b6a36c..f9fc4140 100644 --- a/crates/nvisy-pattern/src/shipped/patterns.rs +++ b/crates/nvisy-pattern/src/shipped/patterns.rs @@ -119,7 +119,7 @@ shipped_pattern!( fn datetime from "personal/datetime.toml" ); -/// Every built-in regex pattern shipped by this crate, in arbitrary +/// Every built-in pattern shipped by this crate, in arbitrary /// stable order. #[must_use] pub fn all() -> Vec { diff --git a/crates/nvisy-pattern/src/validators/date.rs b/crates/nvisy-pattern/src/validators/date.rs index 69a38f4f..bcf30246 100644 --- a/crates/nvisy-pattern/src/validators/date.rs +++ b/crates/nvisy-pattern/src/validators/date.rs @@ -3,13 +3,12 @@ //! Validates that a regex-matched date string represents a real calendar //! date. Supports multiple common formats. -/// Validate a date string in common formats. +/// Return `true` if `value` is a real calendar date in one of the +/// supported written formats. /// /// Supported: `MM/DD/YYYY`, `DD/MM/YYYY`, `YYYY-MM-DD`, `YYYY/MM/DD` -/// (with `/` or `-` separators). -/// -/// Checks that the date is a real calendar date (accounts for leap years) -/// and that the year is in 1900:2100. +/// (with `/` or `-` separators). Leap years are honoured and the +/// year must fall in `1900..=2100`. 
/// /// # Ambiguity /// @@ -18,7 +17,7 @@ /// back to `DD/MM/YYYY` if the first part is not a valid month. This /// is a format-level structural check — locale disambiguation is out /// of scope. -pub fn validate_date(value: &str) -> bool { +pub fn date(value: &str) -> bool { let parts: Vec<&str> = value.split(['/', '-']).collect(); if parts.len() != 3 { return false; @@ -94,47 +93,47 @@ mod tests { #[test] fn mm_dd_yyyy() { - assert!(validate_date("01/15/1990")); - assert!(validate_date("12-31-2000")); + assert!(date("01/15/1990")); + assert!(date("12-31-2000")); } #[test] fn yyyy_mm_dd() { - assert!(validate_date("1990-01-15")); - assert!(validate_date("2000/12/31")); + assert!(date("1990-01-15")); + assert!(date("2000/12/31")); } #[test] fn leap_year() { - assert!(validate_date("02/29/2000")); - assert!(validate_date("2000-02-29")); - assert!(!validate_date("02/29/2001")); + assert!(date("02/29/2000")); + assert!(date("2000-02-29")); + assert!(!date("02/29/2001")); } #[test] fn invalid_day() { - assert!(!validate_date("04/31/1990")); - assert!(!validate_date("01/32/1990")); - assert!(!validate_date("01/00/1990")); + assert!(!date("04/31/1990")); + assert!(!date("01/32/1990")); + assert!(!date("01/00/1990")); } #[test] fn invalid_month() { // 13/01/1990 is valid as DD/MM/YYYY (Jan 13) - assert!(validate_date("13/01/1990")); + assert!(date("13/01/1990")); // YYYY-MM-DD format: month 13 is invalid - assert!(!validate_date("1990-13-01")); + assert!(!date("1990-13-01")); } #[test] fn invalid_year() { - assert!(!validate_date("01/01/1899")); - assert!(!validate_date("1899-01-01")); + assert!(!date("01/01/1899")); + assert!(!date("1899-01-01")); } #[test] fn dd_mm_yyyy_ambiguous() { // 15/01/1990: first part > 12 so must be DD/MM - assert!(validate_date("15/01/1990")); + assert!(date("15/01/1990")); } } diff --git a/crates/nvisy-pattern/src/validators/iban.rs b/crates/nvisy-pattern/src/validators/iban.rs index 52faf668..0df4a542 100644 --- 
a/crates/nvisy-pattern/src/validators/iban.rs +++ b/crates/nvisy-pattern/src/validators/iban.rs @@ -7,7 +7,7 @@ /// Return `true` if `value` passes the ISO 13616 mod-97 IBAN check. /// /// Whitespace and dashes are stripped before validation. -pub fn validate_iban(value: &str) -> bool { +pub fn iban(value: &str) -> bool { let cleaned: String = value .chars() .filter(|c| !c.is_ascii_whitespace() && *c != '-') @@ -53,31 +53,31 @@ mod tests { #[test] fn valid_ibans() { // GB, DE, FR examples from Wikipedia. - assert!(validate_iban("GB29 NWBK 6016 1331 9268 19")); - assert!(validate_iban("DE89370400440532013000")); - assert!(validate_iban("FR76 3000 6000 0112 3456 7890 189")); + assert!(iban("GB29 NWBK 6016 1331 9268 19")); + assert!(iban("DE89370400440532013000")); + assert!(iban("FR76 3000 6000 0112 3456 7890 189")); } #[test] fn invalid_check_digits() { - assert!(!validate_iban("GB29 NWBK 6016 1331 9268 18")); - assert!(!validate_iban("DE00370400440532013000")); + assert!(!iban("GB29 NWBK 6016 1331 9268 18")); + assert!(!iban("DE00370400440532013000")); } #[test] fn too_short() { - assert!(!validate_iban("GB29")); - assert!(!validate_iban("")); + assert!(!iban("GB29")); + assert!(!iban("")); } #[test] fn non_alphanumeric() { - assert!(!validate_iban("GB29!NWBK60161331926819")); + assert!(!iban("GB29!NWBK60161331926819")); } #[test] fn strips_whitespace_and_dashes() { - assert!(validate_iban("GB29-NWBK-6016-1331-9268-19")); - assert!(validate_iban(" GB29 NWBK 6016 1331 9268 19 ")); + assert!(iban("GB29-NWBK-6016-1331-9268-19")); + assert!(iban(" GB29 NWBK 6016 1331 9268 19 ")); } } diff --git a/crates/nvisy-pattern/src/validators/luhn.rs b/crates/nvisy-pattern/src/validators/luhn.rs index 88cb5146..40bb5bc0 100644 --- a/crates/nvisy-pattern/src/validators/luhn.rs +++ b/crates/nvisy-pattern/src/validators/luhn.rs @@ -15,7 +15,7 @@ /// /// Returns `false` if the input is empty or contains characters other /// than digits, spaces, and dashes. 
-pub fn luhn_check(num: &str) -> bool { +pub fn luhn(num: &str) -> bool { if num.is_empty() { return false; } @@ -56,41 +56,41 @@ mod tests { #[test] fn valid_card_numbers() { - assert!(luhn_check("4539 1488 0343 6467")); - assert!(luhn_check("4539148803436467")); - assert!(luhn_check("4539-1488-0343-6467")); + assert!(luhn("4539 1488 0343 6467")); + assert!(luhn("4539148803436467")); + assert!(luhn("4539-1488-0343-6467")); } #[test] fn invalid_card_numbers() { - assert!(!luhn_check("4539 1488 0343 6466")); - assert!(!luhn_check("1234567890123456")); + assert!(!luhn("4539 1488 0343 6466")); + assert!(!luhn("1234567890123456")); } #[test] fn empty_input() { - assert!(!luhn_check("")); + assert!(!luhn("")); } #[test] fn non_digit_input() { - assert!(!luhn_check("abcdef")); + assert!(!luhn("abcdef")); } #[test] fn mixed_alpha_digit_rejected() { - assert!(!luhn_check("45abc39")); - assert!(!luhn_check("4539 14X8 0343 6467")); + assert!(!luhn("45abc39")); + assert!(!luhn("4539 14X8 0343 6467")); } #[test] fn single_zero() { - assert!(luhn_check("0")); + assert!(luhn("0")); } #[test] fn only_separators_rejected() { - assert!(!luhn_check(" ")); - assert!(!luhn_check("---")); + assert!(!luhn(" ")); + assert!(!luhn("---")); } } diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index 991f8422..f384b762 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -1,9 +1,10 @@ //! Post-match validators for detected entity values. //! -//! A [`Regex`] can reference a validator by name (e.g. -//! `validator: Some("luhn")`) to reduce false positives. At -//! [`PatternRecognizer::build`] time the name is resolved against a -//! [`ValidatorRegistry`] to a concrete validation function. +//! A [`Variant`] inside a [`Regex`] rule can reference a validator +//! by name (e.g. `validator: Some("luhn")`) to reduce false +//! positives. At [`PatternRecognizer::build`] time the name is +//! 
resolved against a [`ValidatorRegistry`] to a concrete +//! validation function. //! //! The default [`ValidatorRegistry::builtin`] ships with five //! validators — `luhn`, `iban`, `ssn`, `phone`, `date`. Consumers @@ -11,6 +12,7 @@ //! [`ValidatorRegistry::with`] before handing it to the recognizer //! builder. //! +//! [`Variant`]: crate::Variant //! [`Regex`]: crate::Regex //! [`PatternRecognizer::build`]: crate::PatternRecognizer @@ -20,16 +22,16 @@ mod luhn; mod phone; mod ssn; +pub use self::date::date; +pub use self::iban::iban; +pub use self::luhn::luhn; +pub use self::phone::phone; +pub use self::ssn::ssn; + use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; -use self::date::validate_date; -use self::iban::validate_iban; -use self::luhn::luhn_check; -use self::phone::validate_phone; -use self::ssn::validate_ssn; - /// Post-match validator: returns `true` when `matched` passes the /// validator's check. /// @@ -51,15 +53,15 @@ where } } -/// Resolves validator names referenced in [`Regex`] definitions to -/// concrete [`Validator`] implementations. +/// Resolves validator names referenced in [`Variant`] definitions +/// to concrete [`Validator`] implementations. /// /// Keys are [`Cow<'static, str>`] so the built-in registrations skip /// any allocation (`&'static str` literal → borrowed variant) while /// caller-supplied names that aren't `'static` (e.g. dynamically /// constructed at runtime) still flow through as owned `String`s. /// -/// [`Regex`]: crate::Regex +/// [`Variant`]: crate::Variant #[derive(Clone, Default)] pub struct ValidatorRegistry { table: HashMap, Arc>, @@ -74,16 +76,18 @@ impl ValidatorRegistry { Self::default() } - /// Registry pre-loaded with every built-in validator: `luhn`, - /// `iban`, `ssn`, `phone`, `date`. + /// Registry pre-loaded with every built-in validator: [`luhn`], + /// [`iban`], [`ssn`], [`phone`], [`date`]. 
Each is also + /// re-exported individually from this module so consumers can + /// mix-and-match without taking all five. #[must_use] pub fn builtin() -> Self { Self::empty() - .with("luhn", luhn_check) - .with("iban", validate_iban) - .with("ssn", validate_ssn) - .with("phone", validate_phone) - .with("date", validate_date) + .with("luhn", luhn) + .with("iban", iban) + .with("ssn", ssn) + .with("phone", phone) + .with("date", date) } /// Register `validator` under `name`. Overwrites any previous diff --git a/crates/nvisy-pattern/src/validators/phone.rs b/crates/nvisy-pattern/src/validators/phone.rs index cf7ed377..d503ba1f 100644 --- a/crates/nvisy-pattern/src/validators/phone.rs +++ b/crates/nvisy-pattern/src/validators/phone.rs @@ -3,15 +3,16 @@ //! Validates that a regex-matched phone number has a plausible structure: //! correct digit count and no obviously invalid prefixes. -/// Validate a phone number matched by regex. +/// Return `true` if `value` has a plausible phone-number structure. /// /// Strips all non-digit characters, then checks: +/// /// - 7 to 15 digits (ITU-T E.164 range) /// - When the original begins with `+` (explicit E.164), the digits /// must not start with 0 (no country code is `0…`). National formats /// such as UK `020 7946 0958` keep their trunk-prefix zero and remain /// valid. 
-pub fn validate_phone(value: &str) -> bool { +pub fn phone(value: &str) -> bool { let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect(); let len = digits.len(); @@ -32,44 +33,44 @@ mod tests { #[test] fn valid_us_numbers() { - assert!(validate_phone("+1-555-123-4567")); - assert!(validate_phone("(555) 123-4567")); - assert!(validate_phone("555.123.4567")); - assert!(validate_phone("5551234567")); + assert!(phone("+1-555-123-4567")); + assert!(phone("(555) 123-4567")); + assert!(phone("555.123.4567")); + assert!(phone("5551234567")); } #[test] fn valid_international() { - assert!(validate_phone("+44 20 7946 0958")); - assert!(validate_phone("+49 30 12345678")); - assert!(validate_phone("+81 3 1234 5678")); + assert!(phone("+44 20 7946 0958")); + assert!(phone("+49 30 12345678")); + assert!(phone("+81 3 1234 5678")); } #[test] fn too_few_digits() { - assert!(!validate_phone("12345")); - assert!(!validate_phone("123-45")); + assert!(!phone("12345")); + assert!(!phone("123-45")); } #[test] fn too_many_digits() { - assert!(!validate_phone("1234567890123456")); + assert!(!phone("1234567890123456")); } #[test] fn e164_starting_with_zero_rejected() { - assert!(!validate_phone("+0123456789012")); + assert!(!phone("+0123456789012")); } #[test] fn national_format_with_trunk_zero_accepted() { // UK national format keeps the leading 0 trunk prefix. - assert!(validate_phone("020 7946 0958")); - assert!(validate_phone("0207946 0958")); + assert!(phone("020 7946 0958")); + assert!(phone("0207946 0958")); } #[test] fn local_number_with_seven_digits() { - assert!(validate_phone("123-4567")); + assert!(phone("123-4567")); } } diff --git a/crates/nvisy-pattern/src/validators/ssn.rs b/crates/nvisy-pattern/src/validators/ssn.rs index 6732a0d4..46258064 100644 --- a/crates/nvisy-pattern/src/validators/ssn.rs +++ b/crates/nvisy-pattern/src/validators/ssn.rs @@ -10,7 +10,7 @@ /// format. /// /// This is a format check, not a verification against SSA records. 
-pub fn validate_ssn(value: &str) -> bool { +pub fn ssn(value: &str) -> bool { let parts: Vec<&str> = value.split('-').collect(); if parts.len() != 3 { return false; @@ -36,42 +36,42 @@ mod tests { #[test] fn valid() { - assert!(validate_ssn("123-45-6789")); - assert!(validate_ssn("001-01-0001")); - assert!(validate_ssn("899-99-9999")); + assert!(ssn("123-45-6789")); + assert!(ssn("001-01-0001")); + assert!(ssn("899-99-9999")); } #[test] fn invalid_area_zero() { - assert!(!validate_ssn("000-45-6789")); + assert!(!ssn("000-45-6789")); } #[test] fn invalid_area_666() { - assert!(!validate_ssn("666-45-6789")); + assert!(!ssn("666-45-6789")); } #[test] fn invalid_area_900_plus() { - assert!(!validate_ssn("900-45-6789")); - assert!(!validate_ssn("999-45-6789")); + assert!(!ssn("900-45-6789")); + assert!(!ssn("999-45-6789")); } #[test] fn invalid_group_zero() { - assert!(!validate_ssn("123-00-6789")); + assert!(!ssn("123-00-6789")); } #[test] fn invalid_serial_zero() { - assert!(!validate_ssn("123-45-0000")); + assert!(!ssn("123-45-0000")); } #[test] fn wrong_format() { - assert!(!validate_ssn("12345-6789")); - assert!(!validate_ssn("123456789")); - assert!(!validate_ssn("abc-de-fghi")); - assert!(!validate_ssn("")); + assert!(!ssn("12345-6789")); + assert!(!ssn("123456789")); + assert!(!ssn("abc-de-fghi")); + assert!(!ssn("")); } } diff --git a/crates/nvisy-pattern/testdata/patterns/employee_id.toml b/crates/nvisy-pattern/testdata/patterns/employee_id.toml index 19949064..b959d33e 100644 --- a/crates/nvisy-pattern/testdata/patterns/employee_id.toml +++ b/crates/nvisy-pattern/testdata/patterns/employee_id.toml @@ -1,4 +1,6 @@ name = "internal-employee-id" label = "internal_id" + +[[variants]] regex = "\\bEMP-\\d{5}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/testdata/patterns/product_codes.toml b/crates/nvisy-pattern/testdata/patterns/product_codes.toml index 29f74865..87e7941b 100644 --- a/crates/nvisy-pattern/testdata/patterns/product_codes.toml +++ 
b/crates/nvisy-pattern/testdata/patterns/product_codes.toml @@ -1,4 +1,6 @@ name = "internal-product-code" label = "internal_id" + +[[variants]] regex = "\\b(?:WIDGET-\\d{3}|SPROCKET-\\d{2}|GADGET-X\\d)\\b" score = 0.9 diff --git a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs index dbc09cbc..6c1fca87 100644 --- a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs +++ b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs @@ -1,103 +1,77 @@ -//! End-to-end: feed real input through the -//! recognizer → [`ContextEnhancer`] handoff, and verify that -//! confidence is boosted, the recognition step's `contextual` flag is -//! set, and a [`Refinement`] -//! step is appended only for matches that had a nearby keyword. +//! End-to-end: feed real input through a [`Regex`] → +//! [`PatternRecognizer`] (wrapped in [`Boosting`]) and verify +//! that confidence is boosted, and a [`Refinement`] step is +//! appended only for matches that had a nearby keyword. //! //! [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement +//! 
[`Boosting`]: nvisy_context::Boosting -use nvisy_context::{Context, ContextEnhancer}; -use nvisy_core::entity::{PatternProvenance, TrailProvenance, TrailStepKind, builtins}; -use nvisy_core::extraction::Artifacts; +use nvisy_core::entity::{TrailStepKind, builtins}; use nvisy_core::modality::TextData; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; -use nvisy_pattern::{PatternRecognizer, PatternRegistry, Regex}; +use nvisy_pattern::{Regex, PatternRecognizer, Variant}; #[tokio::test] async fn enhancer_boosts_matches_near_keyword_only() { - let ssn = Regex::builder() - .with_name("ssn") - .with_label(builtins::GOVERNMENT_ID.label_ref()) + let variant = Variant::builder() .with_regex(r"\b\d{3}-\d{2}-\d{4}\b") .with_score(Confidence::clamped(0.6)) - .with_context(Context::new(["ssn", "social security"])) + .build() + .expect("ssn variant builds"); + let regex = Regex::builder() + .with_name("ssn") + .with_label(builtins::GOVERNMENT_ID.label_ref()) + .with_context(vec!["ssn".to_owned(), "social security".to_owned()]) + .with_variants(vec![variant]) .build() .expect("ssn regex builds"); - let registry = PatternRegistry::new().with_pattern(ssn); let recognizer = PatternRecognizer::builder() - .with_registry(registry.clone()) + .with_pattern(regex) .build() .expect("recognizer builds"); // Two SSN-shaped numbers: one near the keyword, one not. let text = "First SSN: 123-45-6789. Unrelated number 987-65-4329 elsewhere."; let input = RecognizerInput::new(TextData::new(text.to_owned())); - let mut entities = recognizer + let entities = recognizer .recognize(&input) .await .expect("recognize") .entities; assert_eq!(entities.len(), 2, "two SSN matches expected"); - // Snapshot base confidences keyed by match text so we can compare - // before vs after. 
- let mut before: std::collections::HashMap = std::collections::HashMap::new(); - for e in &entities { - before.insert( - text[e.location.start..e.location.end].to_owned(), - e.confidence.get(), - ); - } - - let enhancer = ContextEnhancer::builder() - .with_registry(registry.context_registry()) - .with_default_window(20) - .with_default_boost(0.3) - .build() - .expect("enhancer builds"); - enhancer.enhance(&mut entities, text, &Artifacts::new()); - - // First match has `SSN:` within the 20-byte window → boosted. + // First match has `SSN:` within the default 5-word prefix/suffix + // window and gets boosted by the Boosting wrapper. let near = entities .iter() .find(|e| &text[e.location.start..e.location.end] == "123-45-6789") .expect("near match present"); assert!( - near.confidence.get() > before["123-45-6789"], - "near-keyword match should be boosted" + near.confidence.get() > 0.6, + "near-keyword match should be boosted", ); assert!( near.trail .iter() .any(|s| matches!(s.kind, TrailStepKind::Refinement)), - "near-keyword match should have a Refinement step" - ); - let TrailProvenance::Pattern(PatternProvenance::Regex { contextual, .. }) = - &near.trail[0].provenance - else { - panic!("expected regex provenance on recognition step"); - }; - assert!( - *contextual, - "contextual flag should be set on recognition step" + "near-keyword match should have a Refinement step", ); - // Second match is well outside the 20-byte window → untouched. + // Second match is well outside the window → untouched. 
let far = entities .iter() .find(|e| &text[e.location.start..e.location.end] == "987-65-4329") .expect("far match present"); - assert_eq!( - far.confidence.get(), - before["987-65-4329"], - "far-from-keyword match should not be boosted" + assert!( + (far.confidence.get() - 0.6).abs() < f64::EPSILON, + "far-from-keyword match should not be boosted", ); assert!( !far.trail .iter() .any(|s| matches!(s.kind, TrailStepKind::Refinement)), - "far-from-keyword match should have no Refinement step" + "far-from-keyword match should have no Refinement step", ); } diff --git a/crates/nvisy-pattern/tests/shipped_detection.rs b/crates/nvisy-pattern/tests/shipped_detection.rs index c6374987..1f6a30f9 100644 --- a/crates/nvisy-pattern/tests/shipped_detection.rs +++ b/crates/nvisy-pattern/tests/shipped_detection.rs @@ -10,17 +10,14 @@ use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; use nvisy_core::modality::{Text, TextData}; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; -use nvisy_pattern::{PatternRecognizer, PatternRegistry}; - -fn shipped_recognizer() -> PatternRecognizer { - PatternRecognizer::builder() - .with_registry(PatternRegistry::builtin()) - .build() - .expect("shipped recognizer builds") -} +use nvisy_pattern::PatternRecognizer; async fn scan(text: &str) -> (String, Vec>) { - let recognizer = shipped_recognizer(); + let recognizer = PatternRecognizer::builder() + .with_builtin_patterns() + .with_builtin_dictionaries() + .build() + .expect("shipped recognizer builds"); let input = RecognizerInput::new(TextData::new(text.to_owned())); let entities = recognizer .recognize(&input) diff --git a/crates/nvisy-pattern/tests/user_rules.rs b/crates/nvisy-pattern/tests/user_rules.rs index 6e78c8ac..38dcee10 100644 --- a/crates/nvisy-pattern/tests/user_rules.rs +++ b/crates/nvisy-pattern/tests/user_rules.rs @@ -1,20 +1,21 @@ -//! End-to-end: load user-supplied rules from the on-disk wire shape -//! 
(`testdata/patterns/*.toml`, `testdata/dictionaries/*.{toml,csv}`) -//! through [`Regex::from_toml`], [`Dictionary::metadata_from_toml`], -//! and [`Terms::from_csv`], mix them with shipped patterns, and +//! End-to-end: load user-supplied patterns from the on-disk wire +//! shape (`testdata/patterns/*.toml`, +//! `testdata/dictionaries/*.{toml,csv}`) through +//! [`Regex::from_toml`], [`Dictionary::metadata_from_toml`], and +//! [`Terms::from_csv`], mix them with shipped patterns, and //! confirm a real internal-handoff document yields the custom //! entities. use nvisy_core::entity::builtins; use nvisy_core::modality::TextData; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; -use nvisy_pattern::{Dictionary, PatternRecognizer, PatternRegistry, Regex, Terms}; +use nvisy_pattern::{Dictionary, Regex, PatternRecognizer, Terms}; #[tokio::test] async fn user_toml_rules_load_and_detect() { let employee_id = Regex::from_toml(include_str!("../testdata/patterns/employee_id.toml")) .expect("employee_id.toml parses"); - let product_code_regex = + let product_code_pattern = Regex::from_toml(include_str!("../testdata/patterns/product_codes.toml")) .expect("product_codes.toml parses"); @@ -30,15 +31,12 @@ async fn user_toml_rules_load_and_detect() { // 4 rows × 3 columns; every non-empty cell becomes a term. assert_eq!(product_code_dict.terms.len(), 12); - // Mix user rules with shipped (so the input also sees email etc.). - let registry = PatternRegistry::new() + // Mix user patterns with shipped (so the input also sees email etc.). 
+ let recognizer = PatternRecognizer::builder() .with_pattern(employee_id) - .with_pattern(product_code_regex) + .with_pattern(product_code_pattern) .with_dictionary(product_code_dict) - .with_builtin_patterns(); - - let recognizer = PatternRecognizer::builder() - .with_registry(registry) + .with_builtin_patterns() .build() .expect("recognizer builds"); @@ -82,12 +80,12 @@ async fn user_toml_rules_load_and_detect() { "expected dictionary alias/full-name hit, got {emp_hits:?}" ); - // Shipped email pattern fires too — proves user + shipped coexist. + // Shipped email regex fires too — proves user + shipped coexist. assert!( entities .iter() .any(|e| e.label == builtins::EMAIL_ADDRESS.label_ref() && &text[e.location.start..e.location.end] == "counsel@example.com"), - "expected shipped email pattern to fire alongside user rules" + "expected shipped email regex to fire alongside user rules" ); } diff --git a/crates/nvisy-toolkit/Cargo.toml b/crates/nvisy-toolkit/Cargo.toml index 88d17f47..3a485c61 100644 --- a/crates/nvisy-toolkit/Cargo.toml +++ b/crates/nvisy-toolkit/Cargo.toml @@ -80,6 +80,8 @@ unicode-normalization = { workspace = true, features = [] } [dev-dependencies] # Internal test utilities (Entity::test_builder, …). nvisy-core = { workspace = true, features = ["test-utils"] } +# Boosting wrapper type returned by PatternRecognizer::build(). +nvisy-context = { workspace = true, features = [] } # Codec front door for E2E pipeline tests. Production builds pull # nvisy-codec via the toolkit's per-modality features; the dev # entry pins txt/csv/json so the pipeline_*.rs tests compile. 
diff --git a/crates/nvisy-toolkit/examples/pipeline.rs b/crates/nvisy-toolkit/examples/pipeline.rs index 9a6a0656..d4703f4f 100644 --- a/crates/nvisy-toolkit/examples/pipeline.rs +++ b/crates/nvisy-toolkit/examples/pipeline.rs @@ -26,7 +26,7 @@ use nvisy_core::modality::{Text, TextData}; use nvisy_core::primitive::ConfidenceThreshold; use nvisy_core::recognition::RecognizerInput; use nvisy_core::redaction::RedactAt; -use nvisy_pattern::{PatternRecognizer, PatternRegistry}; +use nvisy_pattern::PatternRecognizer; use nvisy_toolkit::deduplication::{LayerContext, LayerParams, LayerPipeline}; use nvisy_toolkit::detection::RecognizerRegistry; use nvisy_toolkit::redaction::RedactionRegistry; @@ -54,7 +54,8 @@ async fn main() -> Result<()> { // services. Add NER / LLM recognizers with extra // `.with_recognizer(...)` calls. let pattern = PatternRecognizer::builder() - .with_registry(PatternRegistry::builtin()) + .with_builtin_patterns() + .with_builtin_dictionaries() .build()?; let detection = RecognizerRegistry::new().with_recognizer(pattern); diff --git a/crates/nvisy-toolkit/src/redaction/deanonymizer/mod.rs b/crates/nvisy-toolkit/src/redaction/deanonymizer/mod.rs index 3ea2731b..733c0664 100644 --- a/crates/nvisy-toolkit/src/redaction/deanonymizer/mod.rs +++ b/crates/nvisy-toolkit/src/redaction/deanonymizer/mod.rs @@ -3,7 +3,8 @@ //! //! Each operator recovers the original payload an [`Anonymizer`] //! wrote. Two recovery shapes ship today (see [`Deanonymizer`]): -//! audit-keyed (no impl yet) and self-contained ([`Decrypt`]). +//! audit-keyed (no impl yet) and self-contained (e.g. `Decrypt`, +//! gated behind the `encrypt` feature). //! //! [`Anonymizer`]: crate::redaction::Anonymizer //! 
[`Deanonymizer`]: crate::redaction::Deanonymizer diff --git a/crates/nvisy-toolkit/src/redaction/mod.rs b/crates/nvisy-toolkit/src/redaction/mod.rs index f5fbb9e3..d8ec4a2a 100644 --- a/crates/nvisy-toolkit/src/redaction/mod.rs +++ b/crates/nvisy-toolkit/src/redaction/mod.rs @@ -26,12 +26,10 @@ //! built-in (constructed inline) or a `Custom(AnonymizerId)` //! (looked up in the registry). //! -//! [`Replace`]: anonymizer::Replace -//! [`Mask`]: anonymizer::Mask -//! [`Hash`]: anonymizer::Hash -//! [`Redact`]: anonymizer::Redact -//! [`Keep`]: anonymizer::Keep -//! [`Encrypt`]: anonymizer::Encrypt +//! [`Anonymizer`]: Anonymizer +//! [`Deanonymizer`]: Deanonymizer +//! [`AnonymizerId`]: AnonymizerId +//! [`RedactionRegistry`]: RedactionRegistry mod id; mod registry; diff --git a/crates/nvisy-toolkit/tests/fixtures/registries.rs b/crates/nvisy-toolkit/tests/fixtures/registries.rs index 08657054..3c77d299 100644 --- a/crates/nvisy-toolkit/tests/fixtures/registries.rs +++ b/crates/nvisy-toolkit/tests/fixtures/registries.rs @@ -1,18 +1,21 @@ //! Shared recognizer + redaction registry constructors and dedup //! params used by every codec E2E test. +use nvisy_context::Boosting; use nvisy_core::entity::builtins; use nvisy_core::modality::Modality; use nvisy_core::primitive::ConfidenceThreshold; -use nvisy_pattern::{PatternRecognizer, PatternRegistry}; +use nvisy_pattern::PatternRecognizer; use nvisy_toolkit::deduplication::LayerParams; use nvisy_toolkit::redaction::anonymizer::{Mask, Replace}; use nvisy_toolkit::redaction::{Anonymizer, RedactionRegistry}; -/// Build the shipped pattern recognizer from every built-in pattern. -pub fn shipped_recognizer() -> PatternRecognizer { +/// Build the shipped pattern recognizer from every built-in +/// pattern + dictionary, wrapped in its [`Boosting`] layer. 
+pub fn shipped_recognizer() -> Boosting { PatternRecognizer::builder() - .with_registry(PatternRegistry::builtin()) + .with_builtin_patterns() + .with_builtin_dictionaries() .build() .expect("shipped recognizer builds") } diff --git a/crates/nvisy-toolkit/tests/recognition_registry.rs b/crates/nvisy-toolkit/tests/recognition_registry.rs index 1556e2d8..80460d41 100644 --- a/crates/nvisy-toolkit/tests/recognition_registry.rs +++ b/crates/nvisy-toolkit/tests/recognition_registry.rs @@ -30,7 +30,7 @@ use nvisy_llm::provider::LlmProvider; use nvisy_llm::{DefaultPrompt, LlmRecognizer}; use nvisy_ner::NerRecognizer; use nvisy_ner::backend::{BentoBackend, BentoParams}; -use nvisy_pattern::{PatternRecognizer, PatternRegistry}; +use nvisy_pattern::PatternRecognizer; use nvisy_toolkit::detection::RecognizerRegistry; /// Sample text that triggers all three recognizers: @@ -46,9 +46,10 @@ fn env_or(key: &str, default: &str) -> String { fn build_registry() -> RecognizerRegistry { let pattern = PatternRecognizer::builder() - .with_registry(PatternRegistry::builtin()) + .with_builtin_patterns() + .with_builtin_dictionaries() .build() - .expect("pattern recognizer builds from builtin registry"); + .expect("pattern recognizer builds from builtin set"); let bento_url = env_or("NVISY_BENTO_URL", "http://localhost:3000"); let bento_backend = BentoBackend::new(BentoParams::new(bento_url)).expect("bento backend init"); From 628227cfc4c856ba77fcd1e8373628cb97d68971 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Jun 2026 22:43:09 +0200 Subject: [PATCH 05/14] refactor(pattern): inline VariantBuilder, drop Terms, normalize all docs - Variant: replace derive_builder with `new(regex)?` + `with_score` / `with_validator` chain (matches Term::new style). 
- Drop the Terms newtype; Dictionary::terms is `Vec` and the parsers move to associated fns on Term: - Term::from_text(&str) -> Vec (infallible) - Term::from_csv(&str) -> Result, Error> Signatures now match Regex::from_toml / Dictionary::from_toml. - Rewrite every public-item docblock in nvisy-pattern for a consistent style: noun-phrase openers for types, imperative for constructors/setters, returns-form for predicates, reference-form doc-links at the bottom, `# Errors` only where fallible, code examples on top-level types. Co-Authored-By: Claude Opus 4.7 --- crates/nvisy-context/src/enhancer.rs | 16 +- crates/nvisy-context/src/matcher.rs | 6 +- crates/nvisy-context/src/wrapper.rs | 3 +- crates/nvisy-pattern/src/lib.rs | 2 +- .../src/recognition/dictionary.rs | 146 ++++++------ crates/nvisy-pattern/src/recognition/mod.rs | 21 +- .../src/recognition/recognizer.rs | 104 +++++---- crates/nvisy-pattern/src/recognition/regex.rs | 157 +++++++------ crates/nvisy-pattern/src/recognition/term.rs | 104 +++++++++ crates/nvisy-pattern/src/recognition/terms.rs | 208 ------------------ .../nvisy-pattern/src/shipped/dictionaries.rs | 17 +- crates/nvisy-pattern/src/shipped/mod.rs | 13 +- crates/nvisy-pattern/src/validators/date.rs | 20 +- crates/nvisy-pattern/src/validators/iban.rs | 14 +- crates/nvisy-pattern/src/validators/luhn.rs | 20 +- crates/nvisy-pattern/src/validators/mod.rs | 92 ++++---- crates/nvisy-pattern/src/validators/phone.rs | 17 +- crates/nvisy-pattern/src/validators/ssn.rs | 19 +- .../nvisy-pattern/tests/enhancer_roundtrip.rs | 10 +- crates/nvisy-pattern/tests/user_rules.rs | 6 +- 20 files changed, 445 insertions(+), 550 deletions(-) create mode 100644 crates/nvisy-pattern/src/recognition/term.rs delete mode 100644 crates/nvisy-pattern/src/recognition/terms.rs diff --git a/crates/nvisy-context/src/enhancer.rs b/crates/nvisy-context/src/enhancer.rs index f1eba2df..cd3103b1 100644 --- a/crates/nvisy-context/src/enhancer.rs +++ b/crates/nvisy-context/src/enhancer.rs 
@@ -475,13 +475,7 @@ mod tests { // keyword "social security" must NOT fire — unlike a // hypothetical caller that gave it the word-window path, // which would split on whitespace. - let enhancer = enhancer(vec![rule( - govid_label(), - &["social security"], - 1, - 0, - 0.2, - )]); + let enhancer = enhancer(vec![rule(govid_label(), &["social security"], 1, 0, 0.2)]); let text = "social security: Your 123-45-6789"; let entity_start = text.find("123").unwrap(); let entity_end = entity_start + "123-45-6789".len(); @@ -504,13 +498,7 @@ mod tests { fn token_path_boosts_when_keyword_within_token_window() { // Same tokens, 2-word prefix: now the `social security` // token is reachable and the boost fires. - let enhancer = enhancer(vec![rule( - govid_label(), - &["social security"], - 2, - 0, - 0.2, - )]); + let enhancer = enhancer(vec![rule(govid_label(), &["social security"], 2, 0, 0.2)]); let text = "social security: Your 123-45-6789"; let entity_start = text.find("123").unwrap(); let entity_end = entity_start + "123-45-6789".len(); diff --git a/crates/nvisy-context/src/matcher.rs b/crates/nvisy-context/src/matcher.rs index a2cdb3c3..06beef22 100644 --- a/crates/nvisy-context/src/matcher.rs +++ b/crates/nvisy-context/src/matcher.rs @@ -99,7 +99,11 @@ mod tests { fn substring_matches_case_insensitively() { let m = SubstringMatcher; assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); - assert!(m.any_match("the SOCIAL SECURITY number", &[], &kws(&["social security"]))); + assert!(m.any_match( + "the SOCIAL SECURITY number", + &[], + &kws(&["social security"]) + )); assert!(!m.any_match("nothing here", &[], &kws(&["ssn"]))); } diff --git a/crates/nvisy-context/src/wrapper.rs b/crates/nvisy-context/src/wrapper.rs index 688f2838..87105b19 100644 --- a/crates/nvisy-context/src/wrapper.rs +++ b/crates/nvisy-context/src/wrapper.rs @@ -22,8 +22,7 @@ use nvisy_core::Result; use nvisy_core::modality::Text; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, 
RecognizerOutput}; -use super::Enhancer; -use super::Tokens; +use super::{Enhancer, Tokens}; /// Wraps an [`EntityRecognizer`] with a post-recognition /// [`Enhancer`] pass. Implements [`EntityRecognizer`] so diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index ed069016..129b002f 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -8,6 +8,6 @@ pub mod validators; pub use self::recognition::{ Dictionary, DictionaryBuilder, PatternRecognizer, PatternRecognizerBuilder, Regex, - RegexBuilder, Scoring, Term, Terms, Variant, VariantBuilder, + RegexBuilder, Scoring, Term, Variant, }; pub use self::shipped::{dictionaries, patterns}; diff --git a/crates/nvisy-pattern/src/recognition/dictionary.rs b/crates/nvisy-pattern/src/recognition/dictionary.rs index bf20a1e0..3285f7af 100644 --- a/crates/nvisy-pattern/src/recognition/dictionary.rs +++ b/crates/nvisy-pattern/src/recognition/dictionary.rs @@ -1,22 +1,4 @@ //! [`Dictionary`]: literal-term detection rule. -//! -//! A dictionary scans for a fixed list of literal strings using an -//! Aho-Corasick automaton. Compared with [`Regex`], a dictionary -//! has no regex engine, no validator, and a single shared confidence -//! score applied to every match. -//! -//! Construct via [`Dictionary::builder`] for the chainable style or -//! [`Dictionary::from_toml`] for a self-contained TOML source. -//! -//! Term sources are first-class — see [`Terms`] for [`from_text`] -//! and [`from_csv`] constructors. The builder's [`with_terms`] -//! setter accepts anything convertible to [`Terms`]. -//! -//! [`Regex`]: crate::Regex -//! [`Terms`]: crate::Terms -//! [`from_text`]: crate::Terms::from_text -//! [`from_csv`]: crate::Terms::from_csv -//! 
[`with_terms`]: DictionaryBuilder::with_terms use derive_builder::Builder; use nvisy_core::Error; @@ -24,14 +6,14 @@ use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use serde::Deserialize; -use super::terms::Terms; +use super::term::Term; /// Confidence policy for a [`Dictionary`]'s matches. /// /// Either every term gets the same score ([`Uniform`]), or scores -/// are picked per CSV source column ([`PerColumn`]). The untagged -/// serde representation accepts a bare number for the uniform -/// case and an array for the per-column case: +/// vary by CSV source column ([`PerColumn`]). The untagged serde +/// representation accepts a bare number for the uniform case and +/// an array for the per-column case: /// /// ```toml /// score = 0.9 # Uniform @@ -43,24 +25,28 @@ use super::terms::Terms; #[derive(Debug, Clone, PartialEq, Deserialize)] #[serde(untagged)] pub enum Scoring { - /// Single confidence stamped on every match. The common case. + /// One confidence stamped on every match — the common case. Uniform(Confidence), - /// Per-column confidence vector. `[i]` is the confidence - /// stamped on every term whose source CSV column was `i`. A - /// term from a column past the end of this vec is a - /// recognizer-build error — define one score per column. + /// Per-column confidence vector. Entry `i` is the score for + /// terms loaded from CSV column `i`. A term from a column past + /// the end of this vector causes a recognizer-build error, so + /// callers must declare one score per source column. PerColumn(Vec), } impl Scoring { - /// Validate the policy's internal shape. A - /// `PerColumn(vec![])` can never resolve a score for any - /// column, so callers (the recognizer at build time) surface - /// it as a configuration error. + /// Return `Ok(())` when the policy can resolve a score for at + /// least one input. 
+ /// + /// [`PerColumn`] with an empty vector can never resolve and is + /// rejected here; the recognizer surfaces the error at build + /// time. /// /// # Errors /// - /// Returns the human-readable reason the policy is invalid. + /// Returns a human-readable reason when the policy is invalid. + /// + /// [`PerColumn`]: Self::PerColumn pub fn validate(&self) -> Result<(), &'static str> { match self { Self::Uniform(_) => Ok(()), @@ -71,12 +57,15 @@ impl Scoring { } } - /// Resolve a score for `column`. `Uniform` ignores the column - /// and always returns its score; `PerColumn` returns the entry - /// at `column`, or `None` when no column is supplied or the - /// index is past the end of the per-column vector. Callers - /// decide the fall-back policy (per-term override, hard - /// error, default constant, etc.). + /// Resolve a score for the given source `column`. + /// + /// [`Uniform`] ignores `column` and always returns its score; + /// [`PerColumn`] returns the entry at `column`, or `None` when + /// `column` is `None` or out of range. Callers decide the + /// fall-back policy (per-term override, hard error, …). + /// + /// [`Uniform`]: Self::Uniform + /// [`PerColumn`]: Self::PerColumn #[must_use] pub fn get(&self, column: Option) -> Option { match self { @@ -94,17 +83,29 @@ impl Default for Scoring { /// Literal-term detection rule. /// +/// Scans for a fixed list of literals using a shared Aho-Corasick +/// automaton. Unlike [`Regex`], a dictionary has no regex engine, +/// no validator, and a [`Scoring`] policy shared across its terms. 
+/// +/// # Examples +/// /// ``` /// use nvisy_core::entity::builtins; -/// use nvisy_pattern::{Dictionary, Terms}; +/// use nvisy_pattern::{Dictionary, Term}; /// /// let dictionary = Dictionary::builder() /// .with_name("nationalities") /// .with_label(builtins::NATIONALITY.label_ref()) -/// .with_terms(Terms::from(["German", "French", "Italian"])) +/// .with_terms(vec![ +/// Term::new("German"), +/// Term::new("French"), +/// Term::new("Italian"), +/// ]) /// .build() /// .expect("nationalities dictionary builds"); /// ``` +/// +/// [`Regex`]: crate::Regex #[derive(Debug, Clone, PartialEq, Builder, Deserialize)] #[builder( name = "DictionaryBuilder", @@ -113,40 +114,39 @@ impl Default for Scoring { build_fn(error = "Error") )] pub struct Dictionary { - /// Human-readable identifier (e.g. `"nationalities"`). + /// Human-readable identifier surfaced in trail provenance + /// (e.g. `"nationalities"`). pub name: String, /// Entity label every match emits. pub label: EntityLabelRef, - /// Literal terms to scan for. The recognizer compiles these - /// into an Aho-Corasick automaton at build time. - pub terms: Terms, - /// Confidence policy: uniform across every term, or per CSV - /// source column. Defaults to [`Scoring::Uniform`] with - /// [`Confidence::MAX`]. + /// Literal terms to scan for. Compiled into the shared + /// Aho-Corasick automaton at recognizer-build time. + pub terms: Vec, + /// Confidence policy resolved against each term at + /// recognizer-build time. Defaults to [`Scoring::Uniform`] + /// with [`Confidence::MAX`]. #[builder(default)] #[serde(default, rename = "score")] pub scoring: Scoring, /// Context keywords that lift confidence when one of them - /// appears near a match. Harvested by the engine into a - /// per-label `BoostRule` in `nvisy-context`; the recognizer - /// itself never reads this field. + /// appears near a match. #[builder(default)] #[serde(default)] pub context: Vec, - /// Languages the dictionary applies to (BCP-47 tags). 
An empty - /// list (the default) means the dictionary applies regardless - /// of language; otherwise the recognizer skips this dictionary - /// when the per-call language hint is set to a tag not in this + /// BCP-47 language tags the dictionary applies to. Empty means + /// "any language"; otherwise the recognizer skips the + /// dictionary when the per-call language hint is not in the /// list. #[builder(default)] #[serde(default)] pub languages: Vec, - /// Require word-boundary surroundings on every match. With the - /// default of `true`, a term `"am"` matches the word `"am"` - /// but not the `"am"` inside `"example"`. Word characters are - /// alphanumerics and `_` (Unicode-aware). Set to `false` for - /// dictionaries that genuinely want substring matching (e.g. - /// scanning for embedded credentials inside arbitrary tokens). + /// Require word-boundary surroundings on every match. + /// + /// With the default of `true`, the term `"am"` matches the + /// word `"am"` but not the `"am"` inside `"example"`. Word + /// characters are Unicode alphanumerics and `_`. Set to + /// `false` to allow substring matches (e.g. scanning for + /// embedded credentials). #[builder(default = "true")] #[serde(default = "default_word_boundary")] pub word_boundary: bool, @@ -157,16 +157,18 @@ fn default_word_boundary() -> bool { } impl Dictionary { - /// Start a chainable builder. Required fields: `name`, - /// `label`, `terms`. + /// Start a chainable builder. + /// + /// Required fields: `name`, `label`, `terms`. #[must_use] pub fn builder() -> DictionaryBuilder { DictionaryBuilder::default() } - /// Parse a self-contained dictionary from a TOML string. The - /// TOML must include a `terms` field; for metadata-only TOML - /// paired with a separate term source, use + /// Parse a self-contained dictionary from a TOML source. 
+ /// + /// The TOML must include a `terms` field; for metadata-only + /// TOML paired with a separate term source, use /// [`metadata_from_toml`] instead. /// /// # Errors @@ -180,15 +182,11 @@ impl Dictionary { .map_err(|e| Error::validation(format!("dictionary TOML: {e}"), "nvisy-pattern")) } - /// Parse the metadata fields of a dictionary from TOML (no - /// `terms` required) and return a seeded builder. The caller is - /// expected to chain - /// [`with_terms`] before - /// [`build`]. + /// Parse dictionary metadata from a sidecar TOML source. /// - /// Useful when shipped or user-supplied dictionaries split - /// metadata into a TOML sidecar and store the actual terms as - /// CSV / TXT. + /// The returned [`DictionaryBuilder`] is seeded with every + /// field except `terms`; callers chain [`with_terms`] (e.g. + /// loaded from a paired CSV/TXT) before [`build`]. /// /// # Errors /// @@ -217,8 +215,6 @@ impl Dictionary { } } -/// Wire shape for the dictionary metadata sidecar TOML — every -/// field [`Dictionary`] carries except `terms`. #[derive(Debug, Clone, Deserialize)] struct DictionaryMetadata { name: String, diff --git a/crates/nvisy-pattern/src/recognition/mod.rs b/crates/nvisy-pattern/src/recognition/mod.rs index d6d2d18e..e55bbb39 100644 --- a/crates/nvisy-pattern/src/recognition/mod.rs +++ b/crates/nvisy-pattern/src/recognition/mod.rs @@ -1,18 +1,19 @@ -//! Recognition primitives — the rule shapes ([`Regex`] + its -//! [`Variant`]s, [`Dictionary`]), their building blocks ([`Terms`]), -//! and the runtime [`PatternRecognizer`] that compiles them into -//! pooled scanners. Per-rule and per-dictionary `context` keyword -//! lists are harvested by the recognizer at build time into a -//! wrapping `Boosting` layer that applies post-recognition keyword -//! boosts. +//! Recognition primitives. +//! +//! Holds the rule shapes ([`Regex`] + its [`Variant`]s, [`Dictionary`]), +//! their building blocks ([`Term`]), and the runtime +//! 
[`PatternRecognizer`] that compiles them into pooled scanners. +//! Per-rule and per-dictionary `context` keyword lists are harvested +//! by the recognizer at build time into a wrapping `Boosting` layer +//! that lifts confidence on matches near a declared keyword. mod compiled; mod dictionary; mod recognizer; mod regex; -mod terms; +mod term; pub use self::dictionary::{Dictionary, DictionaryBuilder, Scoring}; pub use self::recognizer::{PatternRecognizer, PatternRecognizerBuilder}; -pub use self::regex::{Regex, RegexBuilder, Variant, VariantBuilder}; -pub use self::terms::{Term, Terms}; +pub use self::regex::{Regex, RegexBuilder, Variant}; +pub use self::term::Term; diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index 01bc5533..fd133fdf 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -1,20 +1,4 @@ -//! [`PatternRecognizer`]: compiles patterns and dictionaries into -//! pooled scanners and implements [`EntityRecognizer`]. -//! -//! The internal split is intentional: regex patterns go into a -//! single [`regex::RegexSet`] for a one-pass scan across every -//! regex; dictionary terms go into a single -//! [`aho_corasick::AhoCorasick`] automaton for a one-pass scan -//! across every literal. Both passes share one walk over the input -//! and emit entities in modality-local byte coordinates. -//! -//! Construction is builder-driven: [`PatternRecognizer::builder`] -//! returns a [`PatternRecognizerBuilder`] that accumulates patterns, -//! dictionaries, and (optionally) a custom validator registry, then -//! compiles everything into the scanners on [`build`]. The shipped -//! built-in pattern + dictionary set is [`PatternRecognizerBuilder::builtin`]. -//! -//! [`build`]: PatternRecognizerBuilder::build +//! [`PatternRecognizer`] and its builder. 
use aho_corasick::{AhoCorasick, MatchKind}; use nvisy_context::{BoostRule, Boosting, Enhancer, SubstringMatcher}; @@ -30,8 +14,21 @@ use super::regex::Regex; use crate::shipped; use crate::validators::ValidatorRegistry; -/// Runtime text recognizer composed of one regex pool and one -/// Aho-Corasick automaton. +/// Runtime text recognizer composed of a regex pool and an Aho-Corasick automaton. +/// +/// Every registered [`Regex`] variant goes into one +/// [`::regex::RegexSet`] for a single one-pass scan across every +/// regex; every [`Dictionary`] term goes into one +/// [`::aho_corasick::AhoCorasick`] automaton for a single one-pass +/// scan across every literal. Both passes share one walk over the +/// input and emit entities in modality-local byte coordinates. +/// +/// Construct via [`PatternRecognizer::builder`]; the build wraps +/// the recognizer in a [`Boosting`] layer that lifts confidence on +/// matches whose neighbourhood contains a per-label context +/// keyword harvested from the same rules. +/// +/// # Examples /// /// ``` /// use nvisy_pattern::PatternRecognizer; @@ -42,6 +39,9 @@ use crate::validators::ValidatorRegistry; /// .build() /// .expect("built-in recognizer builds"); /// ``` +/// +/// [`Regex`]: super::Regex +/// [`Dictionary`]: super::Dictionary pub struct PatternRecognizer { patterns: Vec, regex_set: Option, @@ -50,11 +50,10 @@ pub struct PatternRecognizer { } impl PatternRecognizer { - /// Start a builder. Required: at least one pattern or - /// dictionary; otherwise [`build`] succeeds with a recognizer - /// that always emits zero entities. + /// Start a chainable builder. /// - /// [`build`]: PatternRecognizerBuilder::build + /// A recognizer built with no patterns and no dictionaries is + /// valid — it emits zero entities on every call. 
#[must_use] pub fn builder() -> PatternRecognizerBuilder { PatternRecognizerBuilder::default() @@ -67,9 +66,13 @@ impl PatternRecognizer { } } -/// Accumulates patterns, dictionaries, and a validator registry, -/// then compiles them into a [`PatternRecognizer`] wrapped in a -/// [`Boosting`] layer. +/// Accumulator of rules + validator registry for +/// [`PatternRecognizer`]. +/// +/// Patterns and dictionaries are stored as authored — compilation +/// into the pooled scanners happens in [`build`]. +/// +/// [`build`]: Self::build #[derive(Debug, Clone, Default)] pub struct PatternRecognizerBuilder { patterns: Vec, @@ -84,7 +87,9 @@ impl PatternRecognizerBuilder { Self::default() } - /// Pre-seed with the shipped built-in pattern + dictionary set. + /// Pre-seed with the shipped built-in patterns and + /// dictionaries. + /// /// Shorthand for /// `Self::new().with_builtin_patterns().with_builtin_dictionaries()`. #[must_use] @@ -94,7 +99,7 @@ impl PatternRecognizerBuilder { .with_builtin_dictionaries() } - /// Register one pattern. Patterns accumulate in registration + /// Register one pattern; patterns accumulate in registration /// order. #[must_use] pub fn with_pattern(mut self, pattern: Regex) -> Self { @@ -102,7 +107,7 @@ impl PatternRecognizerBuilder { self } - /// Register one dictionary. Dictionaries accumulate in + /// Register one dictionary; dictionaries accumulate in /// registration order. #[must_use] pub fn with_dictionary(mut self, dictionary: Dictionary) -> Self { @@ -124,18 +129,22 @@ impl PatternRecognizerBuilder { self } - /// Override the validator registry. When unset, the built-in - /// registry ([`ValidatorRegistry::builtin`]) is used. + /// Override the validator registry used to resolve variant + /// validator names. + /// + /// Defaults to [`ValidatorRegistry::builtin`] when unset. 
#[must_use] pub fn with_validators(mut self, registry: ValidatorRegistry) -> Self { self.validators = Some(registry); self } - /// Drop every pattern and dictionary whose `label` is not - /// registered in `catalog`. Used to build a per-request - /// recognizer from a workspace-wide template — rules that - /// would emit labels no policy declared never run. + /// Drop every pattern and dictionary whose label is not + /// declared in `catalog`. + /// + /// The engine uses this to build a per-request recognizer from + /// a workspace-wide template — rules that would emit labels no + /// policy declared never run. #[must_use] pub fn filter_by_catalog(mut self, catalog: &EntityLabelCatalog) -> Self { self.patterns @@ -145,10 +154,11 @@ impl PatternRecognizerBuilder { self } - /// `true` when the builder has no patterns and no - /// dictionaries. Engine code uses this to skip the - /// per-request recognizer entirely when the catalog filter - /// dropped every rule. + /// Return `true` when no patterns and no dictionaries are + /// registered. + /// + /// The engine uses this to skip the per-request recognizer + /// entirely after a catalog filter dropped every rule. #[must_use] pub fn is_empty(&self) -> bool { self.patterns.is_empty() && self.dictionaries.is_empty() @@ -166,10 +176,12 @@ impl PatternRecognizerBuilder { &self.dictionaries } - /// Compile every registered pattern and dictionary into the - /// pooled scanners and wrap the recognizer in a [`Boosting`] - /// layer carrying per-label keyword boosts harvested from the - /// same set of rules. + /// Compile every rule into the pooled scanners and wrap the + /// recognizer in a [`Boosting`] layer. + /// + /// Context keywords from every pattern and dictionary are + /// harvested into per-label [`BoostRule`]s that lift confidence + /// on matches whose neighbourhood contains a declared keyword. 
/// /// # Errors /// @@ -265,7 +277,7 @@ impl PatternRecognizerBuilder { } let term_start = all_terms.len(); let mut term_scores = Vec::with_capacity(dict.terms.len()); - for entry in dict.terms.entries() { + for entry in &dict.terms { all_terms.push(entry.term.clone()); // Per-term `score` wins when set; otherwise ask // the dictionary's `Scoring` to resolve against @@ -407,13 +419,13 @@ mod tests { use super::*; use crate::Dictionary; - use crate::recognition::terms::Terms; + use crate::recognition::term::Term; fn dict(name: &str, terms: &[&str], word_boundary: bool) -> Dictionary { Dictionary::builder() .with_name(name.to_owned()) .with_label(EntityLabelRef::from(builtins::LANGUAGE.name.clone())) - .with_terms(Terms::from(terms)) + .with_terms(terms.iter().copied().map(Term::new).collect::>()) .with_word_boundary(word_boundary) .build() .expect("dictionary builds") diff --git a/crates/nvisy-pattern/src/recognition/regex.rs b/crates/nvisy-pattern/src/recognition/regex.rs index 0d762fcc..f084f4a0 100644 --- a/crates/nvisy-pattern/src/recognition/regex.rs +++ b/crates/nvisy-pattern/src/recognition/regex.rs @@ -1,12 +1,4 @@ -//! [`Regex`]: per-label regex-based detection rule. -//! -//! A `Regex` rule bundles one entity label, its context-keyword -//! list, and one or more [`Variant`]s. Each variant carries its -//! own regex source, emission score, and optional named -//! validator. All variants under one rule emit the same label. -//! -//! Construct via [`Regex::builder`] for the chainable style or -//! [`Regex::from_toml`] when loading a definition file. +//! [`Regex`] rule and its [`Variant`]s. use derive_builder::Builder; use nvisy_core::Error; @@ -14,74 +6,107 @@ use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use serde::Deserialize; -/// One regex variant inside a [`Regex`] rule. 
Carries the regex -/// source, the emission confidence stamped on every match, and the -/// optional validator name resolved at recognizer-build time. -#[derive(Debug, Clone, PartialEq, Builder, Deserialize)] -#[builder( - name = "VariantBuilder", - pattern = "owned", - setter(into, strip_option, prefix = "with"), - build_fn(error = "Error", validate = "VariantBuilder::validate") -)] +/// One regex strategy inside a [`Regex`] rule. +/// +/// A variant pairs a regex source with the confidence stamped on +/// every match it produces and, optionally, a validator name +/// resolved against the [`ValidatorRegistry`] at recognizer-build +/// time so structurally-suspect matches can be dropped. +/// +/// [`ValidatorRegistry`]: crate::validators::ValidatorRegistry +#[derive(Debug, Clone, PartialEq, Deserialize)] pub struct Variant { /// Regex source. Compiled to a [`::regex::Regex`] by - /// [`PatternRecognizer::build`]; shape errors there, not here. + /// [`PatternRecognizer::build`]. /// /// [`PatternRecognizer::build`]: super::PatternRecognizer pub regex: String, - /// Confidence score stamped on every match this variant emits - /// before any post-recognition boost. - #[builder(default = "Confidence::MAX")] + /// Confidence stamped on every match, before any + /// post-recognition keyword boost. + #[serde(default = "default_score")] pub score: Confidence, - /// Optional validator name. Resolved at recognizer-build time - /// against the [`ValidatorRegistry`]; matches that fail - /// validation are dropped. + /// Validator name resolved against the [`ValidatorRegistry`]. + /// Matches that fail validation are dropped. /// /// [`ValidatorRegistry`]: crate::validators::ValidatorRegistry - #[builder(default)] #[serde(default)] pub validator: Option, } impl Variant { - /// Start a chainable builder. Required field: `regex`. 
- #[must_use] - pub fn builder() -> VariantBuilder { - VariantBuilder::default() - } -} - -impl VariantBuilder { - fn validate(&self) -> Result<(), Error> { - if let Some(regex) = self.regex.as_ref() - && let Err(e) = ::regex::Regex::new(regex) - { + /// Construct a variant from a regex source. + /// + /// `score` defaults to [`Confidence::MAX`] and `validator` to + /// `None`; override with [`with_score`] / [`with_validator`]. + /// + /// # Errors + /// + /// Returns a validation error when `regex` is not a valid + /// regular expression. + /// + /// [`with_score`]: Self::with_score + /// [`with_validator`]: Self::with_validator + pub fn new(regex: impl Into) -> Result { + let regex = regex.into(); + if let Err(e) = ::regex::Regex::new(&regex) { return Err(Error::validation( format!("invalid regex: {e}"), "nvisy-pattern", )); } - Ok(()) + Ok(Self { + regex, + score: Confidence::MAX, + validator: None, + }) + } + + /// Set the per-match confidence score. + #[must_use] + pub fn with_score(mut self, score: Confidence) -> Self { + self.score = score; + self + } + + /// Set the validator name to run on every match. + /// + /// The name is resolved against the [`ValidatorRegistry`] when + /// the parent [`PatternRecognizer`] is built; unknown names + /// surface as a build-time error. + /// + /// [`ValidatorRegistry`]: crate::validators::ValidatorRegistry + /// [`PatternRecognizer`]: super::PatternRecognizer + #[must_use] + pub fn with_validator(mut self, name: impl Into) -> Self { + self.validator = Some(name.into()); + self } } -/// Regex-based detection rule: one label, optional boost -/// keywords, one or more [`Variant`]s. Matches the Presidio -/// "pattern recognizer" shape — multiple regex strategies for one -/// entity type, plus a shared context keyword list. +fn default_score() -> Confidence { + Confidence::MAX +} + +/// Regex detection rule: one label, optional keyword boosts, and +/// one or more [`Variant`]s. 
+/// +/// Mirrors the Presidio "pattern recognizer" shape — several regex +/// strategies for one entity type, plus a shared context-keyword +/// list. Every variant emits the same [`label`]; context keywords +/// are harvested by [`PatternRecognizer`] into a wrapping boost +/// layer and are never read by the rule itself. +/// +/// # Examples /// /// ``` /// use nvisy_core::entity::builtins; /// use nvisy_core::primitive::Confidence; /// use nvisy_pattern::{Regex, Variant}; /// -/// let variant = Variant::builder() -/// .with_regex(r"\b\d{3}-\d{2}-\d{4}\b") +/// let variant = Variant::new(r"\b\d{3}-\d{2}-\d{4}\b") +/// .expect("ssn variant builds") /// .with_score(Confidence::clamped(0.9)) -/// .with_validator("ssn") -/// .build() -/// .expect("ssn variant builds"); +/// .with_validator("ssn"); /// /// let ssn = Regex::builder() /// .with_name("ssn") @@ -91,6 +116,9 @@ impl VariantBuilder { /// .build() /// .expect("ssn rule builds"); /// ``` +/// +/// [`label`]: Regex::label +/// [`PatternRecognizer`]: super::PatternRecognizer #[derive(Debug, Clone, PartialEq, Builder, Deserialize)] #[builder( name = "RegexBuilder", @@ -99,44 +127,37 @@ impl VariantBuilder { build_fn(error = "Error") )] pub struct Regex { - /// Human-readable identifier (e.g. `"ssn"`, `"credit_card"`). - /// Surfaced in trail steps so downstream consumers can see - /// which rule matched. + /// Human-readable identifier surfaced in trail provenance (e.g. + /// `"ssn"`, `"credit_card"`). pub name: String, /// Entity label every variant emits. pub label: EntityLabelRef, /// Context keywords that lift confidence when one of them - /// appears near a match. Harvested by [`PatternRecognizer`] - /// into a per-label boost rule; rules themselves never read - /// this field. - /// - /// [`PatternRecognizer`]: super::PatternRecognizer + /// appears near a match. #[builder(default)] #[serde(default)] pub context: Vec, - /// Regex variants. 
At least one is required for the rule to - /// produce any matches; the recognizer skips rules with no - /// variants. + /// Regex variants. At least one is required to produce matches; + /// the recognizer skips rules with an empty variant list. pub variants: Vec, - /// Languages this rule applies to (BCP-47 tags). An empty - /// list (the default) means the rule applies regardless of - /// language; otherwise the recognizer skips this rule when - /// the per-call language hint is set to a tag not in this - /// list. + /// BCP-47 language tags the rule applies to. Empty means "any + /// language"; otherwise the recognizer skips the rule when the + /// per-call language hint is not in the list. #[builder(default)] #[serde(default)] pub languages: Vec, } impl Regex { - /// Start a chainable builder. Required fields: `name`, - /// `label`, `variants`. + /// Start a chainable builder. + /// + /// Required fields: `name`, `label`, `variants`. #[must_use] pub fn builder() -> RegexBuilder { RegexBuilder::default() } - /// Parse a regex rule from a TOML string. + /// Parse a rule from a TOML source. /// /// # Errors /// diff --git a/crates/nvisy-pattern/src/recognition/term.rs b/crates/nvisy-pattern/src/recognition/term.rs new file mode 100644 index 00000000..613dbe0e --- /dev/null +++ b/crates/nvisy-pattern/src/recognition/term.rs @@ -0,0 +1,104 @@ +//! [`Term`]: one literal entry inside a [`Dictionary`]. +//! +//! [`Dictionary`]: crate::Dictionary + +use nvisy_core::Error; +use nvisy_core::primitive::Confidence; +use serde::Deserialize; + +/// One literal scanned for by a [`Dictionary`]. +/// +/// The `column` field is `Some(i)` for CSV-loaded terms and `None` +/// for plain-text or programmatic sources. The `score` field +/// overrides the dictionary's [`Scoring`] for this single term +/// when set — useful for one-off exceptions in an otherwise +/// uniform list. 
+/// +/// [`Dictionary`]: crate::Dictionary +/// [`Scoring`]: crate::Scoring +#[derive(Debug, Clone, PartialEq, Deserialize)] +pub struct Term { + /// The literal scanned for. + pub term: String, + /// CSV source-column index when loaded via [`Term::from_csv`]; + /// `None` otherwise. + #[serde(default)] + pub column: Option, + /// Per-term score override. When `Some`, the recognizer + /// stamps this score on every match; when `None`, falls back + /// to the dictionary's [`Scoring`] resolved against [`column`]. + /// + /// [`Scoring`]: crate::Scoring + /// [`column`]: Self::column + #[serde(default)] + pub score: Option, +} + +impl Term { + /// Construct a term with no column and no score override. + #[must_use] + pub fn new(term: impl Into) -> Self { + Self { + term: term.into(), + column: None, + score: None, + } + } + + /// Attach a CSV source-column index. + #[must_use] + pub fn with_column(mut self, column: u16) -> Self { + self.column = Some(column); + self + } + + /// Set a per-term score override. + #[must_use] + pub fn with_score(mut self, score: Confidence) -> Self { + self.score = Some(score); + self + } + + /// Parse a list of terms from plain text — one term per line. + /// + /// Each line is trimmed; empty lines and lines starting with + /// `#` are skipped. Plain-text terms carry no column. + pub fn from_text(raw: &str) -> Vec { + raw.lines() + .map(str::trim) + .filter(|line| !line.is_empty() && !line.starts_with('#')) + .map(Term::new) + .collect() + } + + /// Parse a list of terms from CSV. + /// + /// Every non-empty cell becomes a term tagged with its 0-based + /// source-column index. The dictionary's [`Scoring::PerColumn`] + /// uses that index to resolve a per-column confidence. + /// + /// # Errors + /// + /// Returns a validation error when the CSV is malformed. 
+ /// + /// [`Scoring::PerColumn`]: crate::Scoring::PerColumn + pub fn from_csv(raw: &str) -> Result, Error> { + let mut reader = csv::ReaderBuilder::new() + .has_headers(false) + .flexible(true) + .from_reader(raw.as_bytes()); + let mut entries = Vec::new(); + for row in reader.records() { + let row = + row.map_err(|e| Error::validation(format!("terms CSV: {e}"), "nvisy-pattern"))?; + for (col_idx, cell) in row.iter().enumerate() { + let trimmed = cell.trim(); + if !trimmed.is_empty() { + let column = u16::try_from(col_idx).unwrap_or(u16::MAX); + entries.push(Term::new(trimmed).with_column(column)); + } + } + } + Ok(entries) + } +} diff --git a/crates/nvisy-pattern/src/recognition/terms.rs b/crates/nvisy-pattern/src/recognition/terms.rs deleted file mode 100644 index d59ec141..00000000 --- a/crates/nvisy-pattern/src/recognition/terms.rs +++ /dev/null @@ -1,208 +0,0 @@ -//! [`Terms`]: a literal-string list, the term source for -//! [`Dictionary`]. -//! -//! [`Dictionary`]: crate::Dictionary -//! -//! A `Terms` value is the bag of literals the recognizer's -//! Aho-Corasick automaton scans for. Construct it from any common -//! shape: -//! -//! - [`Terms::from`] — `Vec`, `&[&str]`, or `[&str; N]` -//! - [`Terms::from_text`] — one term per line, trimmed, with -//! `#`-prefixed comments and blank lines skipped -//! - [`Terms::from_csv`] — every non-empty cell across every row -//! becomes a term; each term remembers its source column index -//! so dictionaries can apply per-column confidence overrides - -use std::io::Cursor; - -use nvisy_core::Error; -use nvisy_core::primitive::Confidence; -use serde::Deserialize; - -/// Literal term list. Each [`Term`] carries an optional source -/// column (set by [`Terms::from_csv`]) plus an optional per-term -/// score override. The column index is the join key for -/// [`Dictionary::scoring`] when it's [`Scoring::PerColumn`]. 
-/// -/// [`Dictionary::scoring`]: crate::Dictionary::scoring -/// [`Scoring::PerColumn`]: crate::Scoring::PerColumn -#[derive(Debug, Clone, PartialEq, Default, Deserialize)] -#[serde(transparent)] -pub struct Terms(Vec); - -/// One entry in a [`Terms`] list: the literal, the column it was -/// loaded from (when applicable), and an optional explicit score -/// that overrides the dictionary's [`Scoring`] policy for this -/// term. -/// -/// Per-term score is `None` for the common path — the dictionary's -/// [`Scoring`] resolves the per-match score from the column. -/// Set `score` only for one-off exceptions (e.g. a term known to -/// be high-confidence even though its column is generally noisy). -/// -/// Per-term column is `None` for non-CSV sources (plain text -/// lists, the `From>` / array impls). `Some(i)` flags -/// a CSV cell from column `i`; the dictionary's -/// [`Scoring::PerColumn`] uses it to pick the per-column score. -/// -/// [`Scoring`]: crate::Scoring -/// [`Scoring::PerColumn`]: crate::Scoring::PerColumn -#[derive(Debug, Clone, PartialEq, Deserialize)] -pub struct Term { - /// The literal scanned for. - pub term: String, - /// CSV column the term came from. `None` for non-CSV - /// sources; `Some(i)` for the cell at column `i` of a CSV. - #[serde(default)] - pub column: Option, - /// Optional per-term score override. When `Some`, the - /// recognizer stamps this score on every match of this term; - /// when `None`, falls back to the dictionary's [`Scoring`] - /// policy resolved against [`column`]. - /// - /// [`Scoring`]: crate::Scoring - /// [`column`]: Self::column - #[serde(default)] - pub score: Option, -} - -impl Term { - /// Construct a term with no column and no per-term score - /// override. The common path for plain-text sources and - /// programmatic `From<…>` constructions. 
- #[must_use] - pub fn new(term: impl Into) -> Self { - Self { - term: term.into(), - column: None, - score: None, - } - } - - /// Attach a CSV source-column index, used by the dictionary's - /// [`Scoring::PerColumn`] to pick a per-column score. - /// - /// [`Scoring::PerColumn`]: crate::Scoring::PerColumn - #[must_use] - pub fn with_column(mut self, column: u16) -> Self { - self.column = Some(column); - self - } - - /// Set an explicit per-term score, overriding the dictionary's - /// column-resolved score for this term. - #[must_use] - pub fn with_score(mut self, score: Confidence) -> Self { - self.score = Some(score); - self - } -} - -impl Terms { - /// Construct an empty term list. - #[must_use] - pub fn new() -> Self { - Self(Vec::new()) - } - - /// Borrow the inner entries. - #[must_use] - pub fn entries(&self) -> &[Term] { - &self.0 - } - - /// Number of terms. - #[must_use] - pub fn len(&self) -> usize { - self.0.len() - } - - /// Whether this list contains no terms. - #[must_use] - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } - - /// Consume into the inner entries. - #[must_use] - pub fn into_inner(self) -> Vec { - self.0 - } - - /// Parse terms from plain-text bytes — one term per line. - /// Each line is trimmed; empty lines and lines starting with - /// `#` are skipped. Plain-text terms carry no column. - /// - /// # Errors - /// - /// Returns a validation error when the input is not valid - /// UTF-8. - pub fn from_text(bytes: &[u8]) -> Result { - let text = std::str::from_utf8(bytes) - .map_err(|e| Error::validation(format!("terms text: {e}"), "nvisy-pattern"))?; - let entries: Vec = text - .lines() - .map(str::trim) - .filter(|line| !line.is_empty() && !line.starts_with('#')) - .map(Term::new) - .collect(); - Ok(Self(entries)) - } - - /// Parse terms from CSV bytes. 
Every non-empty cell across - /// every row becomes a term, and each term remembers the - /// (0-based) column index it came from so a [`Dictionary`] - /// can apply per-column confidence overrides via - /// [`Scoring::PerColumn`]. - /// - /// # Errors - /// - /// Returns a validation error when the CSV is malformed. - /// - /// [`Dictionary`]: crate::Dictionary - /// [`Scoring::PerColumn`]: crate::Scoring::PerColumn - pub fn from_csv(bytes: &[u8]) -> Result { - let mut reader = csv::ReaderBuilder::new() - .has_headers(false) - .flexible(true) - .from_reader(Cursor::new(bytes)); - let mut entries = Vec::new(); - for row in reader.records() { - let row = - row.map_err(|e| Error::validation(format!("terms CSV: {e}"), "nvisy-pattern"))?; - for (col_idx, cell) in row.iter().enumerate() { - let trimmed = cell.trim(); - if !trimmed.is_empty() { - let column = u16::try_from(col_idx).unwrap_or(u16::MAX); - entries.push(Term::new(trimmed).with_column(column)); - } - } - } - Ok(Self(entries)) - } -} - -impl From> for Terms { - fn from(terms: Vec) -> Self { - Self(terms.into_iter().map(Term::new).collect()) - } -} - -impl From<&[&str]> for Terms { - fn from(terms: &[&str]) -> Self { - Self(terms.iter().copied().map(Term::new).collect()) - } -} - -impl From<[&str; N]> for Terms { - fn from(terms: [&str; N]) -> Self { - Self(terms.iter().copied().map(Term::new).collect()) - } -} - -impl From<[String; N]> for Terms { - fn from(terms: [String; N]) -> Self { - Self(terms.into_iter().map(Term::new).collect()) - } -} diff --git a/crates/nvisy-pattern/src/shipped/dictionaries.rs b/crates/nvisy-pattern/src/shipped/dictionaries.rs index 00c1e504..dac8baa8 100644 --- a/crates/nvisy-pattern/src/shipped/dictionaries.rs +++ b/crates/nvisy-pattern/src/shipped/dictionaries.rs @@ -1,21 +1,21 @@ //! Built-in [`Dictionary`]s, embedded at compile time. //! //! Each accessor pairs a TOML metadata sidecar -//! (`assets/dictionaries/**/*.toml`) with a term source -//! 
(`*.csv` for multi-column term lists, `*.txt` for one-per-line), -//! merging them via [`Dictionary::metadata_from_toml`] + -//! [`Terms::from_csv`] / [`Terms::from_text`]. +//! (`assets/dictionaries/**/*.toml`) with a term source (`*.csv` +//! for multi-column term lists, `*.txt` for one-per-line), merging +//! them via [`Dictionary::metadata_from_toml`] + [`Term::from_csv`] +//! / [`Term::from_text`]. //! //! [`Dictionary`]: crate::Dictionary -use crate::recognition::{Dictionary, Terms}; +use crate::recognition::{Dictionary, Term}; macro_rules! shipped_dictionary { ($(#[$meta:meta])* fn $name:ident from $meta_path:literal with csv $terms:literal) => { $(#[$meta])* #[must_use] pub fn $name() -> Dictionary { - let terms = Terms::from_csv(include_bytes!(concat!( + let terms = Term::from_csv(include_str!(concat!( "../../assets/dictionaries/", $terms ))) @@ -34,11 +34,10 @@ macro_rules! shipped_dictionary { $(#[$meta])* #[must_use] pub fn $name() -> Dictionary { - let terms = Terms::from_text(include_bytes!(concat!( + let terms = Term::from_text(include_str!(concat!( "../../assets/dictionaries/", $terms - ))) - .expect(concat!("shipped term source `", $terms, "` parses")); + ))); Dictionary::metadata_from_toml(include_str!(concat!( "../../assets/dictionaries/", $meta_path diff --git a/crates/nvisy-pattern/src/shipped/mod.rs b/crates/nvisy-pattern/src/shipped/mod.rs index 062acea8..db9c7fc5 100644 --- a/crates/nvisy-pattern/src/shipped/mod.rs +++ b/crates/nvisy-pattern/src/shipped/mod.rs @@ -1,14 +1,11 @@ //! Built-in [`Regex`] rules and [`Dictionary`]s shipped with this //! crate. //! -//! Each accessor parses an asset file embedded via -//! [`include_str!`] and returns a fresh [`Regex`] or -//! [`Dictionary`]. Metadata for dictionaries (entity label, score, -//! context) is split into a TOML sidecar paired with a CSV / TXT -//! term source; regex rules are self-contained TOML. -//! -//! Use [`patterns::all`] and [`dictionaries::all`] to load the -//! 
complete shipped set, or pick individual accessors. +//! Each accessor parses an asset embedded via [`include_str!`] and +//! returns a fresh value. Dictionaries split metadata into a TOML +//! sidecar paired with a CSV/TXT term source; regex rules are +//! self-contained TOML. Call [`patterns::all`] / [`dictionaries::all`] +//! to load the full set, or pick individual accessors. //! //! [`Regex`]: crate::Regex //! [`Dictionary`]: crate::Dictionary diff --git a/crates/nvisy-pattern/src/validators/date.rs b/crates/nvisy-pattern/src/validators/date.rs index bcf30246..7d35d0d1 100644 --- a/crates/nvisy-pattern/src/validators/date.rs +++ b/crates/nvisy-pattern/src/validators/date.rs @@ -1,22 +1,18 @@ -//! Date structural validation. -//! -//! Validates that a regex-matched date string represents a real calendar -//! date. Supports multiple common formats. +//! Calendar-date structural validator. /// Return `true` if `value` is a real calendar date in one of the /// supported written formats. /// -/// Supported: `MM/DD/YYYY`, `DD/MM/YYYY`, `YYYY-MM-DD`, `YYYY/MM/DD` -/// (with `/` or `-` separators). Leap years are honoured and the -/// year must fall in `1900..=2100`. +/// Supported formats are `MM/DD/YYYY`, `DD/MM/YYYY`, `YYYY-MM-DD`, +/// and `YYYY/MM/DD`, with `/` or `-` as separators. Leap years +/// are honoured and the year must fall in `1900..=2100`. /// /// # Ambiguity /// -/// When both interpretations are valid (e.g. `02/03/1999` could mean -/// Feb 3 or 3 Mar), the validator prefers `MM/DD/YYYY` and only falls -/// back to `DD/MM/YYYY` if the first part is not a valid month. This -/// is a format-level structural check — locale disambiguation is out -/// of scope. +/// When both interpretations are valid (e.g. `02/03/1999` could +/// mean Feb 3 or 3 Mar), the validator prefers `MM/DD/YYYY` and +/// only falls back to `DD/MM/YYYY` when the first part is not a +/// valid month. Locale disambiguation is out of scope. 
pub fn date(value: &str) -> bool { let parts: Vec<&str> = value.split(['/', '-']).collect(); if parts.len() != 3 { diff --git a/crates/nvisy-pattern/src/validators/iban.rs b/crates/nvisy-pattern/src/validators/iban.rs index 0df4a542..15888d7f 100644 --- a/crates/nvisy-pattern/src/validators/iban.rs +++ b/crates/nvisy-pattern/src/validators/iban.rs @@ -1,12 +1,12 @@ -//! IBAN checksum validator (ISO 13616). -//! -//! Rearranges the IBAN so the country code and check digits move to the -//! end, converts letters to numbers (A=10 … Z=35), and verifies that -//! the resulting number mod 97 equals 1. +//! ISO 13616 IBAN checksum validator. -/// Return `true` if `value` passes the ISO 13616 mod-97 IBAN check. +/// Return `true` if `value` passes the ISO 13616 mod-97 IBAN +/// checksum. /// -/// Whitespace and dashes are stripped before validation. +/// Whitespace and dashes are stripped before validation. The +/// country code and check digits are moved to the end, letters +/// are converted to numbers (`A`=10 … `Z`=35), and the result is +/// accepted when `mod 97 == 1`. pub fn iban(value: &str) -> bool { let cleaned: String = value .chars() diff --git a/crates/nvisy-pattern/src/validators/luhn.rs b/crates/nvisy-pattern/src/validators/luhn.rs index 40bb5bc0..878728ac 100644 --- a/crates/nvisy-pattern/src/validators/luhn.rs +++ b/crates/nvisy-pattern/src/validators/luhn.rs @@ -1,20 +1,16 @@ -//! Luhn checksum validator. -//! -//! Implements the [Luhn algorithm] used to validate credit/debit card -//! numbers and other identification numbers. Only digits, spaces, and -//! dashes are accepted as input: any other character causes the check -//! to fail. -//! -//! [Luhn algorithm]: https://en.wikipedia.org/wiki/Luhn_algorithm +//! Luhn checksum validator for credit-card and similar identifier +//! numbers. -/// Return `true` if `num` passes the Luhn checksum. +/// Return `true` if `num` passes the [Luhn algorithm] checksum. 
/// /// Spaces and dashes are stripped before validation, so /// `"4539 1488 0343 6467"`, `"4539-1488-0343-6467"`, and -/// `"4539148803436467"` are all equivalent. +/// `"4539148803436467"` are equivalent inputs. /// -/// Returns `false` if the input is empty or contains characters other -/// than digits, spaces, and dashes. +/// Returns `false` when the input is empty or contains any +/// character other than digits, spaces, and dashes. +/// +/// [Luhn algorithm]: https://en.wikipedia.org/wiki/Luhn_algorithm pub fn luhn(num: &str) -> bool { if num.is_empty() { return false; diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index f384b762..46395652 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -1,20 +1,19 @@ -//! Post-match validators for detected entity values. +//! Post-match validators for regex-detected entity values. //! -//! A [`Variant`] inside a [`Regex`] rule can reference a validator -//! by name (e.g. `validator: Some("luhn")`) to reduce false -//! positives. At [`PatternRecognizer::build`] time the name is -//! resolved against a [`ValidatorRegistry`] to a concrete -//! validation function. +//! A [`Variant`] inside a [`Regex`] rule may name a validator +//! (e.g. `validator: Some("luhn")`); the recognizer resolves the +//! name against a [`ValidatorRegistry`] at build time and drops +//! matches that fail the resolved check. Use validators to weed +//! out structurally-suspect false positives that a regex alone +//! can't. //! -//! The default [`ValidatorRegistry::builtin`] ships with five -//! validators — `luhn`, `iban`, `ssn`, `phone`, `date`. Consumers -//! can extend the registry with their own validators by calling -//! [`ValidatorRegistry::with`] before handing it to the recognizer -//! builder. +//! [`ValidatorRegistry::builtin`] ships with [`luhn`], [`iban`], +//! [`ssn`], [`phone`], and [`date`]. Each validator is also +//! 
re-exported as a free function so consumers can compose a +//! custom registry without taking the full set. //! //! [`Variant`]: crate::Variant //! [`Regex`]: crate::Regex -//! [`PatternRecognizer::build`]: crate::PatternRecognizer mod date; mod iban; @@ -22,25 +21,25 @@ mod luhn; mod phone; mod ssn; +use std::borrow::Cow; +use std::collections::HashMap; +use std::sync::Arc; + pub use self::date::date; pub use self::iban::iban; pub use self::luhn::luhn; pub use self::phone::phone; pub use self::ssn::ssn; -use std::borrow::Cow; -use std::collections::HashMap; -use std::sync::Arc; - -/// Post-match validator: returns `true` when `matched` passes the -/// validator's check. +/// Post-match validator returning whether a matched string is +/// structurally valid. /// -/// Implemented by both built-in function-pointer validators (via the -/// blanket impl) and any third-party validator types a consumer -/// registers. +/// Implemented by every `Fn(&str) -> bool + Send + Sync` via the +/// blanket impl, so plain function pointers slot in without a +/// wrapper type. Implement directly for types that need to carry +/// state (e.g. a remote-lookup client). pub trait Validator: Send + Sync { - /// Validate the text the recognizer matched. Returns `true` to - /// keep the match, `false` to drop it. + /// Return `true` to keep the match, `false` to drop it. fn validate(&self, matched: &str) -> bool; } @@ -53,33 +52,30 @@ where } } -/// Resolves validator names referenced in [`Variant`] definitions -/// to concrete [`Validator`] implementations. -/// -/// Keys are [`Cow<'static, str>`] so the built-in registrations skip -/// any allocation (`&'static str` literal → borrowed variant) while -/// caller-supplied names that aren't `'static` (e.g. dynamically -/// constructed at runtime) still flow through as owned `String`s. +/// Name → validator resolver consulted at recognizer-build time. 
/// -/// [`Variant`]: crate::Variant +/// Keys are [`Cow<'static, str>`] so a `&'static str` literal stays +/// borrowed while a runtime-built name flows through as an owned +/// `String`. #[derive(Clone, Default)] pub struct ValidatorRegistry { table: HashMap, Arc>, } impl ValidatorRegistry { - /// Empty registry — no validators registered. Regex rules that - /// reference a validator name will fail to resolve at recognizer - /// build time. + /// Construct an empty registry. + /// + /// Any [`Variant`] referencing a validator name will fail to + /// resolve at recognizer-build time. + /// + /// [`Variant`]: crate::Variant #[must_use] pub fn empty() -> Self { Self::default() } - /// Registry pre-loaded with every built-in validator: [`luhn`], - /// [`iban`], [`ssn`], [`phone`], [`date`]. Each is also - /// re-exported individually from this module so consumers can - /// mix-and-match without taking all five. + /// Construct a registry pre-loaded with the built-in + /// validators: [`luhn`], [`iban`], [`ssn`], [`phone`], [`date`]. #[must_use] pub fn builtin() -> Self { Self::empty() @@ -90,16 +86,11 @@ impl ValidatorRegistry { .with("date", date) } - /// Register `validator` under `name`. Overwrites any previous - /// entry with the same name. - /// - /// Built-ins live under `"luhn"`, `"iban"`, `"ssn"`, `"phone"`, - /// and `"date"`; consumers can override them with their own - /// implementations by registering under the same name. + /// Register `validator` under `name`, overwriting any previous + /// entry with the same key. /// - /// `name` accepts anything convertible to [`Cow<'static, str>`] - /// — a `&'static str` literal stays borrowed (zero allocation), - /// an owned `String` becomes the owned variant. + /// Override a built-in by registering under the same name + /// (e.g. `"luhn"`). 
#[must_use] pub fn with(mut self, name: N, validator: V) -> Self where @@ -110,8 +101,11 @@ impl ValidatorRegistry { self } - /// Look up a validator by name, returning the registered - /// implementation or `None` when the name is unknown. + /// Look up a validator by name. + /// + /// Returns `None` when the name is unregistered; the + /// recognizer's build step surfaces that as a configuration + /// error. #[must_use] pub fn resolve(&self, name: &str) -> Option> { self.table.get(name).cloned() diff --git a/crates/nvisy-pattern/src/validators/phone.rs b/crates/nvisy-pattern/src/validators/phone.rs index d503ba1f..51d4cdd6 100644 --- a/crates/nvisy-pattern/src/validators/phone.rs +++ b/crates/nvisy-pattern/src/validators/phone.rs @@ -1,17 +1,14 @@ -//! Phone number structural validation. -//! -//! Validates that a regex-matched phone number has a plausible structure: -//! correct digit count and no obviously invalid prefixes. +//! Phone-number structural validator. /// Return `true` if `value` has a plausible phone-number structure. /// -/// Strips all non-digit characters, then checks: +/// All non-digit characters are stripped, then checks: /// -/// - 7 to 15 digits (ITU-T E.164 range) -/// - When the original begins with `+` (explicit E.164), the digits -/// must not start with 0 (no country code is `0…`). National formats -/// such as UK `020 7946 0958` keep their trunk-prefix zero and remain -/// valid. +/// - 7 to 15 digits (the ITU-T E.164 range). +/// - When the original begins with `+` (explicit E.164), the +/// digits must not start with `0` — no country code is `0…`. +/// National formats such as UK `020 7946 0958` keep their +/// trunk-prefix zero and remain valid. 
pub fn phone(value: &str) -> bool { let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect(); let len = digits.len(); diff --git a/crates/nvisy-pattern/src/validators/ssn.rs b/crates/nvisy-pattern/src/validators/ssn.rs index 46258064..223408eb 100644 --- a/crates/nvisy-pattern/src/validators/ssn.rs +++ b/crates/nvisy-pattern/src/validators/ssn.rs @@ -1,15 +1,16 @@ //! US Social Security Number format validator. -//! -//! Validates the `AAA-GG-SSSS` format where: -//! -//! - **Area** (AAA): 001–899, excluding 666. -//! - **Group** (GG): 01–99. -//! - **Serial** (SSSS): 0001–9999. -/// Return `true` if `value` is a structurally valid US SSN in `AAA-GG-SSSS` -/// format. +/// Return `true` if `value` is a structurally valid US SSN in +/// `AAA-GG-SSSS` format. /// -/// This is a format check, not a verification against SSA records. +/// Validates the three parts as: +/// +/// - **Area** (`AAA`): 001–899, excluding 666. +/// - **Group** (`GG`): 01–99. +/// - **Serial** (`SSSS`): 0001–9999. +/// +/// This is a format check only — not a verification against SSA +/// records. 
pub fn ssn(value: &str) -> bool { let parts: Vec<&str> = value.split('-').collect(); if parts.len() != 3 { diff --git a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs index 6c1fca87..12d3c7f2 100644 --- a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs +++ b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs @@ -10,15 +10,13 @@ use nvisy_core::entity::{TrailStepKind, builtins}; use nvisy_core::modality::TextData; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; -use nvisy_pattern::{Regex, PatternRecognizer, Variant}; +use nvisy_pattern::{PatternRecognizer, Regex, Variant}; #[tokio::test] async fn enhancer_boosts_matches_near_keyword_only() { - let variant = Variant::builder() - .with_regex(r"\b\d{3}-\d{2}-\d{4}\b") - .with_score(Confidence::clamped(0.6)) - .build() - .expect("ssn variant builds"); + let variant = Variant::new(r"\b\d{3}-\d{2}-\d{4}\b") + .expect("ssn variant builds") + .with_score(Confidence::clamped(0.6)); let regex = Regex::builder() .with_name("ssn") .with_label(builtins::GOVERNMENT_ID.label_ref()) diff --git a/crates/nvisy-pattern/tests/user_rules.rs b/crates/nvisy-pattern/tests/user_rules.rs index 38dcee10..589e6464 100644 --- a/crates/nvisy-pattern/tests/user_rules.rs +++ b/crates/nvisy-pattern/tests/user_rules.rs @@ -2,14 +2,14 @@ //! shape (`testdata/patterns/*.toml`, //! `testdata/dictionaries/*.{toml,csv}`) through //! [`Regex::from_toml`], [`Dictionary::metadata_from_toml`], and -//! [`Terms::from_csv`], mix them with shipped patterns, and +//! [`Term::from_csv`], mix them with shipped patterns, and //! confirm a real internal-handoff document yields the custom //! entities. 
use nvisy_core::entity::builtins; use nvisy_core::modality::TextData; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; -use nvisy_pattern::{Dictionary, Regex, PatternRecognizer, Terms}; +use nvisy_pattern::{Dictionary, PatternRecognizer, Regex, Term}; #[tokio::test] async fn user_toml_rules_load_and_detect() { @@ -19,7 +19,7 @@ async fn user_toml_rules_load_and_detect() { Regex::from_toml(include_str!("../testdata/patterns/product_codes.toml")) .expect("product_codes.toml parses"); - let terms = Terms::from_csv(include_bytes!("../testdata/dictionaries/product_codes.csv")) + let terms = Term::from_csv(include_str!("../testdata/dictionaries/product_codes.csv")) .expect("product_codes.csv parses"); let product_code_dict = Dictionary::metadata_from_toml(include_str!("../testdata/dictionaries/product_codes.toml")) From ae12d261c9265a3b0b9373e1421aa46f51ce8d48 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sun, 14 Jun 2026 23:40:16 +0200 Subject: [PATCH 06/14] =?UTF-8?q?refactor(pattern,context):=20rename=20Boo?= =?UTF-8?q?sting=E2=86=92ContextEnhanced,=20split=20build()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nvisy-context: `Boosting` → `ContextEnhanced` (more self-descriptive; reads as "an R that's been context-enhanced"). - nvisy-pattern: `PatternRecognizerBuilder::build()` now returns the bare `PatternRecognizer`; the wrapped form moves to `build_context_enhanced() -> ContextEnhanced`. Callers opt into the keyword-boost layer explicitly. - Engine config, shipped-detection / user-rules / enhancer roundtrip tests, toolkit fixtures + example flipped to `build_context_enhanced()` to preserve prior behavior. - README + module/struct docs rewritten to describe both methods without historical framing. 
Co-Authored-By: Claude Opus 4.7 --- crates/nvisy-context/src/lib.rs | 2 +- crates/nvisy-context/src/wrapper.rs | 10 ++-- .../nvisy-engine/src/detection/config/mod.rs | 2 +- crates/nvisy-pattern/README.md | 9 ++-- .../nvisy-pattern/src/recognition/compiled.rs | 2 +- crates/nvisy-pattern/src/recognition/mod.rs | 13 ++--- .../src/recognition/recognizer.rs | 53 +++++++++++++------ .../nvisy-pattern/tests/enhancer_roundtrip.rs | 8 +-- .../nvisy-pattern/tests/shipped_detection.rs | 2 +- crates/nvisy-pattern/tests/user_rules.rs | 2 +- crates/nvisy-toolkit/Cargo.toml | 2 +- crates/nvisy-toolkit/examples/pipeline.rs | 2 +- .../tests/fixtures/registries.rs | 8 +-- .../tests/recognition_registry.rs | 2 +- 14 files changed, 71 insertions(+), 46 deletions(-) diff --git a/crates/nvisy-context/src/lib.rs b/crates/nvisy-context/src/lib.rs index 192796f2..244e113c 100644 --- a/crates/nvisy-context/src/lib.rs +++ b/crates/nvisy-context/src/lib.rs @@ -12,4 +12,4 @@ pub use self::enhancer::Enhancer; pub use self::matcher::{KeywordMatcher, LemmaMatcher, SubstringMatcher}; pub use self::rule::{BoostRule, DEFAULT_BOOST, DEFAULT_PREFIX_WORDS, DEFAULT_SUFFIX_WORDS}; pub use self::tokens::{Token, Tokens}; -pub use self::wrapper::Boosting; +pub use self::wrapper::ContextEnhanced; diff --git a/crates/nvisy-context/src/wrapper.rs b/crates/nvisy-context/src/wrapper.rs index 87105b19..c4128f0f 100644 --- a/crates/nvisy-context/src/wrapper.rs +++ b/crates/nvisy-context/src/wrapper.rs @@ -1,4 +1,4 @@ -//! [`Boosting`]: post-recognition keyword-boost wrapper for any +//! [`ContextEnhanced`]: post-recognition keyword-boost wrapper for any //! [`EntityRecognizer`]. //! //! Composes an inner recognizer with an [`Enhancer`]: the wrapper @@ -12,7 +12,7 @@ //! ```ignore //! let inner = MyRecognizer::new(...); //! let enhancer = Enhancer::new(rules, Box::new(SubstringMatcher)); -//! let recognizer = Boosting::new(inner, enhancer); +//! let recognizer = ContextEnhanced::new(inner, enhancer); //! ``` //! 
//! The wrapper implements [`EntityRecognizer`] so the engine @@ -34,12 +34,12 @@ use super::{Enhancer, Tokens}; /// same `&str` for the keyword-window walk; a recognizer that /// emitted entities relative to a different coordinate space /// would surface stale or panic-on-slice offsets. -pub struct Boosting { +pub struct ContextEnhanced { inner: R, enhancer: Enhancer, } -impl Boosting { +impl ContextEnhanced { /// Wrap `inner` with `enhancer`. After `recognize` produces /// entities, `enhancer` runs over them in place. pub fn new(inner: R, enhancer: Enhancer) -> Self { @@ -59,7 +59,7 @@ impl Boosting { } #[async_trait::async_trait] -impl EntityRecognizer for Boosting +impl EntityRecognizer for ContextEnhanced where R: EntityRecognizer + 'static, { diff --git a/crates/nvisy-engine/src/detection/config/mod.rs b/crates/nvisy-engine/src/detection/config/mod.rs index 3f74c9ac..ec937436 100644 --- a/crates/nvisy-engine/src/detection/config/mod.rs +++ b/crates/nvisy-engine/src/detection/config/mod.rs @@ -86,7 +86,7 @@ impl DetectionConfig { .with_builtin_dictionaries() .filter_by_catalog(catalog); if !builder.is_empty() { - reg = reg.with_recognizer::(builder.build()?); + reg = reg.with_recognizer::(builder.build_context_enhanced()?); } } diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md index 7d299119..f39a99e2 100644 --- a/crates/nvisy-pattern/README.md +++ b/crates/nvisy-pattern/README.md @@ -15,10 +15,11 @@ shared `regex::RegexSet` for the regex side and one shared single walk over the input runs both scanners and emits `Entity` values in modality-local byte coordinates. -Each rule may declare per-label context keywords; the recognizer -wraps itself in a `nvisy_context::Boosting` layer at build time -that lifts confidence on matches whose neighbourhood contains a -declared keyword. +Rules may declare per-label context keywords. 
Calling +`build_context_enhanced()` wraps the recognizer in a +`nvisy_context::ContextEnhanced` layer that lifts confidence on +matches whose neighbourhood contains a declared keyword; +`build()` returns the bare recognizer. The built-in pattern + dictionary set lives as TOML under `assets/` and is embedded at compile time. The recognizer's diff --git a/crates/nvisy-pattern/src/recognition/compiled.rs b/crates/nvisy-pattern/src/recognition/compiled.rs index 1283025f..d1a61556 100644 --- a/crates/nvisy-pattern/src/recognition/compiled.rs +++ b/crates/nvisy-pattern/src/recognition/compiled.rs @@ -32,7 +32,7 @@ use crate::validators::Validator; /// indirection. /// /// `context` is intentionally not stored on compiled state — the -/// recognizer's wrapping `Boosting` layer harvests keywords from +/// recognizer's wrapping `ContextEnhanced` layer harvests keywords from /// the source patterns at build time. pub(super) struct CompiledPattern { /// Pattern name (e.g. `"ssn"`). Surfaced in trail provenance. diff --git a/crates/nvisy-pattern/src/recognition/mod.rs b/crates/nvisy-pattern/src/recognition/mod.rs index e55bbb39..e9be1ed0 100644 --- a/crates/nvisy-pattern/src/recognition/mod.rs +++ b/crates/nvisy-pattern/src/recognition/mod.rs @@ -1,11 +1,12 @@ //! Recognition primitives. //! -//! Holds the rule shapes ([`Regex`] + its [`Variant`]s, [`Dictionary`]), -//! their building blocks ([`Terms`]), and the runtime -//! [`PatternRecognizer`] that compiles them into pooled scanners. -//! Per-rule and per-dictionary `context` keyword lists are harvested -//! by the recognizer at build time into a wrapping `Boosting` layer -//! that lifts confidence on matches near a declared keyword. +//! Holds the rule shapes ([`Regex`] + its [`Variant`]s, +//! [`Dictionary`]), their building blocks ([`Term`]), and the +//! runtime [`PatternRecognizer`] that compiles them into pooled +//! scanners. Per-rule and per-dictionary `context` keyword lists +//! 
are harvested by [`PatternRecognizerBuilder::build_context_enhanced`] +//! into a wrapping `ContextEnhanced` layer that lifts confidence +//! on matches near a declared keyword. mod compiled; mod dictionary; diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index fd133fdf..f4ad3db9 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -1,7 +1,7 @@ //! [`PatternRecognizer`] and its builder. use aho_corasick::{AhoCorasick, MatchKind}; -use nvisy_context::{BoostRule, Boosting, Enhancer, SubstringMatcher}; +use nvisy_context::{BoostRule, ContextEnhanced, Enhancer, SubstringMatcher}; use nvisy_core::entity::{Entity, EntityLabelCatalog, EntityLabelRef}; use nvisy_core::modality::Text; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput}; @@ -14,7 +14,8 @@ use super::regex::Regex; use crate::shipped; use crate::validators::ValidatorRegistry; -/// Runtime text recognizer composed of a regex pool and an Aho-Corasick automaton. +/// Runtime text recognizer composed of a regex pool and an +/// Aho-Corasick automaton. /// /// Every registered [`Regex`] variant goes into one /// [`::regex::RegexSet`] for a single one-pass scan across every @@ -23,10 +24,11 @@ use crate::validators::ValidatorRegistry; /// scan across every literal. Both passes share one walk over the /// input and emit entities in modality-local byte coordinates. /// -/// Construct via [`PatternRecognizer::builder`]; the build wraps -/// the recognizer in a [`Boosting`] layer that lifts confidence on +/// Construct via [`PatternRecognizer::builder`]. [`build`] +/// returns the bare recognizer; [`build_context_enhanced`] wraps +/// it in a [`ContextEnhanced`] layer that lifts confidence on /// matches whose neighbourhood contains a per-label context -/// keyword harvested from the same rules. +/// keyword. 
/// /// # Examples /// @@ -42,6 +44,8 @@ use crate::validators::ValidatorRegistry; /// /// [`Regex`]: super::Regex /// [`Dictionary`]: super::Dictionary +/// [`build`]: PatternRecognizerBuilder::build +/// [`build_context_enhanced`]: PatternRecognizerBuilder::build_context_enhanced pub struct PatternRecognizer { patterns: Vec, regex_set: Option, @@ -176,12 +180,14 @@ impl PatternRecognizerBuilder { &self.dictionaries } - /// Compile every rule into the pooled scanners and wrap the - /// recognizer in a [`Boosting`] layer. + /// Compile every rule into the pooled scanners and return the + /// bare recognizer. /// - /// Context keywords from every pattern and dictionary are - /// harvested into per-label [`BoostRule`]s that lift confidence - /// on matches whose neighbourhood contains a declared keyword. + /// Per-rule `context` keywords are ignored on the emission + /// path; the recognizer emits raw confidence as authored by + /// each rule. Call [`build_context_enhanced`] instead + /// (or wrap the result in [`ContextEnhanced`] manually) to lift + /// confidence on matches near a declared keyword. /// /// # Errors /// @@ -190,23 +196,40 @@ impl PatternRecognizerBuilder { /// validator name, when a dictionary's `scoring` is invalid /// or under-declared for some term's source column, or when /// the shared automata cannot be constructed.
- pub fn build(self) -> Result> { + /// + /// [`build_context_enhanced`]: Self::build_context_enhanced + pub fn build(self) -> Result { let validators = self .validators .clone() .unwrap_or_else(ValidatorRegistry::builtin); let (compiled_patterns, regex_set) = self.compile_patterns(&validators)?; let (compiled_dicts, aho) = self.compile_dictionaries()?; - let enhancer = self.build_enhancer(); - let recognizer = PatternRecognizer { + Ok(PatternRecognizer { patterns: compiled_patterns, regex_set, dictionaries: compiled_dicts, aho, - }; + }) + } - Ok(Boosting::new(recognizer, enhancer)) + /// Compile every rule and wrap the recognizer in a + /// [`ContextEnhanced`] layer. + /// + /// Context keywords from every pattern and dictionary are + /// harvested into per-label [`BoostRule`]s that lift confidence + /// on matches whose neighbourhood contains a declared keyword. + /// + /// # Errors + /// + /// See [`build`]. + /// + /// [`build`]: Self::build + pub fn build_context_enhanced(self) -> Result> { + let enhancer = self.build_enhancer(); + let recognizer = self.build()?; + Ok(ContextEnhanced::new(recognizer, enhancer)) } /// Compile every `(pattern, variant)` pair into a diff --git a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs index 12d3c7f2..057a5708 100644 --- a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs +++ b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs @@ -1,10 +1,10 @@ //! End-to-end: feed real input through a [`Regex`] → -//! [`PatternRecognizer`] (wrapped in [`Boosting`]) and verify +//! [`PatternRecognizer`] (wrapped in [`ContextEnhanced`]) and verify //! that confidence is boosted, and a [`Refinement`] step is //! appended only for matches that had a nearby keyword. //! //! [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement -//! [`Boosting`]: nvisy_context::Boosting +//! 
[`ContextEnhanced`]: nvisy_context::ContextEnhanced use nvisy_core::entity::{TrailStepKind, builtins}; use nvisy_core::modality::TextData; @@ -27,7 +27,7 @@ async fn enhancer_boosts_matches_near_keyword_only() { let recognizer = PatternRecognizer::builder() .with_pattern(regex) - .build() + .build_context_enhanced() .expect("recognizer builds"); // Two SSN-shaped numbers: one near the keyword, one not. @@ -41,7 +41,7 @@ async fn enhancer_boosts_matches_near_keyword_only() { assert_eq!(entities.len(), 2, "two SSN matches expected"); // First match has `SSN:` within the default 5-word prefix/suffix - // window and gets boosted by the Boosting wrapper. + // window and gets boosted by the ContextEnhanced wrapper. let near = entities .iter() .find(|e| &text[e.location.start..e.location.end] == "123-45-6789") diff --git a/crates/nvisy-pattern/tests/shipped_detection.rs b/crates/nvisy-pattern/tests/shipped_detection.rs index 1f6a30f9..32d0ac13 100644 --- a/crates/nvisy-pattern/tests/shipped_detection.rs +++ b/crates/nvisy-pattern/tests/shipped_detection.rs @@ -16,7 +16,7 @@ async fn scan(text: &str) -> (String, Vec>) { let recognizer = PatternRecognizer::builder() .with_builtin_patterns() .with_builtin_dictionaries() - .build() + .build_context_enhanced() .expect("shipped recognizer builds"); let input = RecognizerInput::new(TextData::new(text.to_owned())); let entities = recognizer diff --git a/crates/nvisy-pattern/tests/user_rules.rs b/crates/nvisy-pattern/tests/user_rules.rs index 589e6464..a3b477fd 100644 --- a/crates/nvisy-pattern/tests/user_rules.rs +++ b/crates/nvisy-pattern/tests/user_rules.rs @@ -37,7 +37,7 @@ async fn user_toml_rules_load_and_detect() { .with_pattern(product_code_pattern) .with_dictionary(product_code_dict) .with_builtin_patterns() - .build() + .build_context_enhanced() .expect("recognizer builds"); let text = include_str!("../testdata/inputs/internal.txt"); diff --git a/crates/nvisy-toolkit/Cargo.toml b/crates/nvisy-toolkit/Cargo.toml index 
3a485c61..9b2a3e63 100644 --- a/crates/nvisy-toolkit/Cargo.toml +++ b/crates/nvisy-toolkit/Cargo.toml @@ -80,7 +80,7 @@ unicode-normalization = { workspace = true, features = [] } [dev-dependencies] # Internal test utilities (Entity::test_builder, …). nvisy-core = { workspace = true, features = ["test-utils"] } -# Boosting wrapper type returned by PatternRecognizer::build(). +# ContextEnhanced wrapper type returned by PatternRecognizerBuilder::build_context_enhanced(). nvisy-context = { workspace = true, features = [] } # Codec front door for E2E pipeline tests. Production builds pull # nvisy-codec via the toolkit's per-modality features; the dev diff --git a/crates/nvisy-toolkit/examples/pipeline.rs b/crates/nvisy-toolkit/examples/pipeline.rs index d4703f4f..01554730 100644 --- a/crates/nvisy-toolkit/examples/pipeline.rs +++ b/crates/nvisy-toolkit/examples/pipeline.rs @@ -56,7 +56,7 @@ async fn main() -> Result<()> { let pattern = PatternRecognizer::builder() .with_builtin_patterns() .with_builtin_dictionaries() - .build()?; + .build_context_enhanced()?; let detection = RecognizerRegistry::new().with_recognizer(pattern); let input = RecognizerInput::new(TextData::new(SAMPLE.to_owned())); diff --git a/crates/nvisy-toolkit/tests/fixtures/registries.rs b/crates/nvisy-toolkit/tests/fixtures/registries.rs index 3c77d299..30098801 100644 --- a/crates/nvisy-toolkit/tests/fixtures/registries.rs +++ b/crates/nvisy-toolkit/tests/fixtures/registries.rs @@ -1,7 +1,7 @@ //! Shared recognizer + redaction registry constructors and dedup //! params used by every codec E2E test.
-use nvisy_context::Boosting; +use nvisy_context::ContextEnhanced; use nvisy_core::entity::builtins; use nvisy_core::modality::Modality; use nvisy_core::primitive::ConfidenceThreshold; @@ -11,12 +11,12 @@ use nvisy_toolkit::redaction::anonymizer::{Mask, Replace}; use nvisy_toolkit::redaction::{Anonymizer, RedactionRegistry}; /// Build the shipped pattern recognizer from every built-in -/// pattern + dictionary, wrapped in its [`Boosting`] layer. -pub fn shipped_recognizer() -> Boosting { +/// pattern + dictionary, wrapped in its [`ContextEnhanced`] layer. +pub fn shipped_recognizer() -> ContextEnhanced { PatternRecognizer::builder() .with_builtin_patterns() .with_builtin_dictionaries() - .build() + .build_context_enhanced() .expect("shipped recognizer builds") } diff --git a/crates/nvisy-toolkit/tests/recognition_registry.rs b/crates/nvisy-toolkit/tests/recognition_registry.rs index 80460d41..640afb74 100644 --- a/crates/nvisy-toolkit/tests/recognition_registry.rs +++ b/crates/nvisy-toolkit/tests/recognition_registry.rs @@ -48,7 +48,7 @@ fn build_registry() -> RecognizerRegistry { let pattern = PatternRecognizer::builder() .with_builtin_patterns() .with_builtin_dictionaries() - .build() + .build_context_enhanced() .expect("pattern recognizer builds from builtin set"); let bento_url = env_or("NVISY_BENTO_URL", "http://localhost:3000"); From 756a9fa0e47b15635796239f5d1ccb50bd18afa7 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 15 Jun 2026 02:42:04 +0200 Subject: [PATCH 07/14] feat(pattern,context): per-language context keywords + primary-subtag matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nvisy-pattern: new `Context` enum (Global | PerLanguage) replaces `Vec` on Regex and Dictionary. Untagged serde keeps the flat TOML form working unchanged; new form is `[context.en] = [...]`. Shipped phone, credit_card, date_of_birth, datetime patterns now carry EN/ES/DE/FR keyword sets. 
- nvisy-context: BoostRule gains `language: Option<LanguageTag>`. Enhancer storage flips to `HashMap<EntityLabelRef, Vec<BoostRule>>` — one bucket per label, distinct language scopes inside. Enhancer::enhance takes a language hint; ContextEnhanced threads input.language through. - nvisy-core: LanguageTag::new returns nvisy_core::Error; LanguageTag::matches compares primary subtags case-insensitively so `en` matches `en-US` / `en-GB`. BoostRule::applies_to_language and RecognizerInput::applies_to_language switch from `==` to `matches()` so language-scoped rules fire under regional variants. - Tests: TOML round-trip both forms; per-language boost fires for matching language; no boost for non-matching language; no-hint unions all per-language keywords; regional variants (`en-US`) trigger `en`-scoped rules. Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 1 + crates/nvisy-context/src/enhancer.rs | 105 ++++++--- crates/nvisy-context/src/rule.rs | 56 ++++- crates/nvisy-context/src/wrapper.rs | 4 +- .../nvisy-core/src/primitive/language/tag.rs | 75 ++++++ crates/nvisy-core/src/recognition/input.rs | 6 +- crates/nvisy-pattern/Cargo.toml | 1 + .../assets/patterns/contact/phone.toml | 7 +- .../assets/patterns/finance/credit_card.toml | 7 +- .../patterns/personal/date_of_birth.toml | 7 +- .../assets/patterns/personal/datetime.toml | 7 +- crates/nvisy-pattern/src/lib.rs | 2 +- .../nvisy-pattern/src/recognition/context.rs | 163 +++++++++++++ .../src/recognition/dictionary.rs | 8 +- crates/nvisy-pattern/src/recognition/mod.rs | 2 + .../src/recognition/recognizer.rs | 220 +++++++++++++++++- crates/nvisy-pattern/src/recognition/regex.rs | 7 +- 17 files changed, 621 insertions(+), 57 deletions(-) create mode 100644 crates/nvisy-pattern/src/recognition/context.rs diff --git a/Cargo.lock b/Cargo.lock index 5d31d114..5028cb9d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3034,6 +3034,7 @@ dependencies = [ "async-trait", "csv", "derive_builder", + "derive_more", "nvisy-context", "nvisy-core", "regex", diff --git
a/crates/nvisy-context/src/enhancer.rs b/crates/nvisy-context/src/enhancer.rs index cd3103b1..176efcc9 100644 --- a/crates/nvisy-context/src/enhancer.rs +++ b/crates/nvisy-context/src/enhancer.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use nvisy_core::entity::{Entity, EntityLabelRef, TrailStep}; use nvisy_core::modality::Text; +use nvisy_core::primitive::LanguageTag; use unicode_segmentation::UnicodeSegmentation; use super::matcher::KeywordMatcher; @@ -33,28 +34,37 @@ const TRAIL_SOURCE: &str = "context"; /// [`SubstringMatcher`]: super::SubstringMatcher /// [`LemmaMatcher`]: super::LemmaMatcher pub struct Enhancer { - rules: HashMap, + /// Rules bucketed by label. Within one bucket, each entry is + /// a distinct `(language)` scope; rules sharing the same + /// `(label, language)` are pre-merged via [`BoostRule::merge`] + /// at construction. Per-entity application looks up the + /// bucket once by label, then walks the small inner vec + /// filtering on the per-call language hint. + rules: HashMap>, matcher: Box, } impl Enhancer { /// Construct from a rule iterator and matcher. Rules sharing - /// the same label are merged via [`BoostRule::merge`]. + /// the same `(label, language)` are merged via + /// [`BoostRule::merge`]; rules with the same label but + /// distinct languages live as separate entries inside the + /// label's bucket. 
pub fn new( rules: impl IntoIterator, matcher: Box, ) -> Self { - let mut map: HashMap = HashMap::new(); + let mut buckets: HashMap> = HashMap::new(); for rule in rules { - match map.get_mut(&rule.label) { - Some(existing) => existing.merge(rule), - None => { - map.insert(rule.label.clone(), rule); - } + let bucket = buckets.entry(rule.label.clone()).or_default(); + if let Some(existing) = bucket.iter_mut().find(|r| r.language == rule.language) { + existing.merge(rule); + } else { + bucket.push(rule); } } Self { - rules: map, + rules: buckets, matcher, } } @@ -75,7 +85,8 @@ impl Enhancer { } /// Apply boost rules to `entities` in place. For each entity: - /// look up the rule for its label, walk a window of + /// walk every rule registered for its label whose language + /// scope applies under `language`, walk a window of /// `prefix_words` words before and `suffix_words` words after /// the entity's location, ask the matcher whether any keyword /// fires, and on a hit lift confidence by the rule's `boost` @@ -87,25 +98,55 @@ impl Enhancer { /// against the token stream; when absent, words are derived /// from the source text via Unicode word segmentation. /// + /// `language` is the per-call language hint. `None` means + /// "unknown" — every per-language rule applies as a + /// permissive fallback. 
+ /// /// [`Confidence`]: nvisy_core::primitive::Confidence /// [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement - pub fn enhance(&self, entities: &mut [Entity], text: &str, tokens: Option<&[Token]>) { + pub fn enhance( + &self, + entities: &mut [Entity], + text: &str, + tokens: Option<&[Token]>, + language: Option<&LanguageTag>, + ) { if self.rules.is_empty() { return; } for entity in entities { - self.enhance_one(entity, text, tokens); + self.enhance_one(entity, text, tokens, language); } } - fn enhance_one(&self, entity: &mut Entity, text: &str, tokens: Option<&[Token]>) { - let Some(rule) = self.rules.get(&entity.label) else { + fn enhance_one( + &self, + entity: &mut Entity, + text: &str, + tokens: Option<&[Token]>, + language: Option<&LanguageTag>, + ) { + let Some(bucket) = self.rules.get(&entity.label) else { return; }; - if rule.keywords.is_empty() { - return; + for rule in bucket { + if !rule.applies_to_language(language) { + continue; + } + if rule.keywords.is_empty() { + continue; + } + self.apply_rule(entity, rule, text, tokens); } + } + fn apply_rule( + &self, + entity: &mut Entity, + rule: &BoostRule, + text: &str, + tokens: Option<&[Token]>, + ) { let start = entity.location.start; let end = entity.location.end; @@ -311,7 +352,7 @@ mod tests { )]); let text = "Your SSN: 123-45-6789"; let mut entities = vec![entity(govid_label(), 10, 21, 0.6)]; - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert!(entities[0].confidence.get() > 0.6); assert!( entities[0] @@ -326,7 +367,7 @@ mod tests { let enhancer = enhancer(vec![rule(govid_label(), &["social"], 0, 5, 0.2)]); let text = "123-45-6789 (social security number)"; let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert!( entities[0].confidence.get() > 0.6, "trailing keyword within suffix window should boost", @@ -340,7 +381,7 @@ 
mod tests { let text = "123-45-6789 (social security number)"; let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert_eq!(entities[0].confidence.get(), before); } @@ -350,7 +391,7 @@ mod tests { let text = "Mr. Smith is named in the report."; let mut entities = vec![entity(person_label(), 4, 9, 0.5)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert_eq!(entities[0].confidence.get(), before); } @@ -364,7 +405,7 @@ mod tests { let xyz_end = xyz_start + "XYZ".len(); let mut entities = vec![entity(govid_label(), xyz_start, xyz_end, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert_eq!(entities[0].confidence.get(), before); } @@ -373,7 +414,7 @@ mod tests { let enhancer = enhancer(vec![rule(govid_label(), &["here"], 5, 5, 0.9)]); let text = "the value is right here in plain sight"; let mut entities = vec![entity(govid_label(), 16, 21, 0.95)]; - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert!((entities[0].confidence.get() - 1.0).abs() < f64::EPSILON); } @@ -397,7 +438,7 @@ mod tests { let ssn_entity_start = ssn_only.find("123").unwrap(); let ssn_entity_end = ssn_entity_start + "123-45-6789".len(); let mut from_first = vec![entity(govid_label(), ssn_entity_start, ssn_entity_end, 0.6)]; - make_enhancer().enhance(&mut from_first, ssn_only, None); + make_enhancer().enhance(&mut from_first, ssn_only, None, None); assert!( from_first[0].confidence.get() > 0.6, "keyword `ssn` from the first rule must still boost after merge", @@ -408,7 +449,7 @@ mod tests { let tax_entity_start = taxid_only.find("987").unwrap(); let tax_entity_end = tax_entity_start + 
"987-65-4329".len(); let mut from_second = vec![entity(govid_label(), tax_entity_start, tax_entity_end, 0.6)]; - make_enhancer().enhance(&mut from_second, taxid_only, None); + make_enhancer().enhance(&mut from_second, taxid_only, None, None); assert!( from_second[0].confidence.get() > 0.6, "keyword `tax id` from the second rule must still boost after merge", @@ -423,7 +464,7 @@ mod tests { let entity_start = text.find("123").unwrap(); let entity_end = entity_start + "123-45-6789".len(); let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert!( entities[0].confidence.get() > 0.6, "unicode word should be reachable within 3-word prefix", @@ -439,7 +480,7 @@ mod tests { let entity_end = entity_start + "123-45-6789".len(); let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None); + enhancer.enhance(&mut entities, text, None, None); assert_eq!(entities[0].confidence.get(), before); } @@ -453,8 +494,8 @@ mod tests { let text = "Your SSN: 123-45-6789"; let mut from_none = vec![entity(govid_label(), 10, 21, 0.6)]; let mut from_empty = vec![entity(govid_label(), 10, 21, 0.6)]; - enhancer.enhance(&mut from_none, text, None); - enhancer.enhance(&mut from_empty, text, Some(&[])); + enhancer.enhance(&mut from_none, text, None, None); + enhancer.enhance(&mut from_empty, text, Some(&[]), None); assert_eq!( from_none[0].confidence.get(), from_empty[0].confidence.get(), @@ -486,7 +527,7 @@ mod tests { ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, Some(&tokens)); + enhancer.enhance(&mut entities, text, Some(&tokens), None); assert_eq!( entities[0].confidence.get(), before, @@ -508,7 +549,7 @@ mod tests { 
Token::from_text("123-45-6789", 22..33), ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, Some(&tokens)); + enhancer.enhance(&mut entities, text, Some(&tokens), None); assert!( entities[0].confidence.get() > 0.6, "2-word prefix should reach the `social security` token", @@ -537,7 +578,7 @@ mod tests { Token::from_text("system", 41..47), ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, Some(&tokens)); + enhancer.enhance(&mut entities, text, Some(&tokens), None); assert!( entities[0].confidence.get() > 0.6, "lemma matcher should match `run` against the `running` token's lemma", @@ -570,7 +611,7 @@ mod tests { Token::from_text("document", 18..26), ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, Some(&tokens)); + enhancer.enhance(&mut entities, text, Some(&tokens), None); assert!( entities[0].confidence.get() > 0.6, "tokens that don't overlap the entity must fall back to the word window", diff --git a/crates/nvisy-context/src/rule.rs b/crates/nvisy-context/src/rule.rs index 7f88cf78..f45c423a 100644 --- a/crates/nvisy-context/src/rule.rs +++ b/crates/nvisy-context/src/rule.rs @@ -20,7 +20,7 @@ use std::collections::HashSet; use hipstr::HipStr; use nvisy_core::entity::EntityLabelRef; -use nvisy_core::primitive::Confidence; +use nvisy_core::primitive::{Confidence, LanguageTag}; /// Default window radius in words *before* an entity match. /// Mirrors Presidio's `context_prefix_count = 5`. @@ -49,6 +49,11 @@ pub struct BoostRule { /// /// [`label`]: nvisy_core::entity::Entity::label pub label: EntityLabelRef, + /// Language scope. `None` means the rule applies regardless + /// of the per-call language hint; `Some(lang)` means the rule + /// only fires when the caller's language matches, or when no + /// hint is set (permissive fallback). 
+ pub language: Option, /// Keywords whose presence near a match lifts the entity's /// confidence. Stored as [`HipStr`] for cheap clones across /// per-pass rule sets. @@ -71,8 +76,12 @@ pub struct BoostRule { impl BoostRule { /// Construct a rule for `label` with explicit window radii - /// and `boost`. Most callers want [`BoostRule::for_label`] - /// instead — it bakes in the default window / boost values. + /// and `boost`. The rule is language-agnostic; use + /// [`with_language`] to scope it. Most callers want + /// [`BoostRule::for_label`] instead — it bakes in the default + /// window / boost values. + /// + /// [`with_language`]: Self::with_language #[must_use] pub fn new( label: EntityLabelRef, @@ -83,6 +92,7 @@ impl BoostRule { ) -> Self { Self { label, + language: None, keywords: keywords.into_iter().map(Into::into).collect(), prefix_words, suffix_words, @@ -113,6 +123,36 @@ impl BoostRule { ) } + /// Scope this rule to a single language. + /// + /// At apply time the rule fires only when the caller's + /// language hint matches `language`, or when no hint is set + /// (permissive fallback). + #[must_use] + pub fn with_language(mut self, language: LanguageTag) -> Self { + self.language = Some(language); + self + } + + /// Return `true` when this rule applies under the per-call + /// language hint. + /// + /// - Language-agnostic rules (`self.language == None`) + /// always apply. + /// - Language-scoped rules apply when the hint shares a + /// primary subtag with the scope (so a rule scoped to + /// `"en"` fires for `"en-US"` and `"en-GB"` hints), or + /// when no hint is set (permissive fallback so callers + /// who don't pass a language still get boosts). 
+ #[must_use] + pub fn applies_to_language(&self, hint: Option<&LanguageTag>) -> bool { + match (&self.language, hint) { + (None, _) => true, + (Some(_), None) => true, + (Some(scope), Some(hint)) => scope.matches(hint), + } + } + /// Merge `other` into this rule by extending the keyword set /// with any keywords not already present. Window radii and /// `boost` are kept from `self` — callers that need different @@ -121,14 +161,18 @@ impl BoostRule { /// /// # Panics /// - /// Debug-asserts when the labels differ. Merging across labels - /// is a caller bug — rules are keyed by label and the engine - /// looks them up by label. + /// Debug-asserts when the labels or languages differ. Merging + /// across keys is a caller bug — rules are keyed by + /// `(label, language)` and the engine looks them up by both. pub fn merge(&mut self, other: BoostRule) { debug_assert_eq!( self.label, other.label, "BoostRule::merge requires matching labels", ); + debug_assert_eq!( + self.language, other.language, + "BoostRule::merge requires matching languages", + ); let existing: HashSet<&str> = self.keywords.iter().map(HipStr::as_str).collect(); let additions: Vec> = other .keywords diff --git a/crates/nvisy-context/src/wrapper.rs b/crates/nvisy-context/src/wrapper.rs index c4128f0f..c9d415ec 100644 --- a/crates/nvisy-context/src/wrapper.rs +++ b/crates/nvisy-context/src/wrapper.rs @@ -70,7 +70,9 @@ where } let text = input.data.text.as_str(); let tokens = input.artifacts.get::().map(Tokens::as_slice); - self.enhancer.enhance(&mut output.entities, text, tokens); + let language = input.language.as_ref(); + self.enhancer + .enhance(&mut output.entities, text, tokens, language); Ok(output) } } diff --git a/crates/nvisy-core/src/primitive/language/tag.rs b/crates/nvisy-core/src/primitive/language/tag.rs index e0da8fac..412dc65b 100644 --- a/crates/nvisy-core/src/primitive/language/tag.rs +++ b/crates/nvisy-core/src/primitive/language/tag.rs @@ -3,6 +3,8 @@ use derive_more::{Display, 
FromStr}; use serde::{Deserialize, Serialize}; +use crate::Error; + /// A validated [BCP-47] language tag. /// /// Wraps [`LanguageTag`] with serde support. Use `#[schemars(with = @@ -26,6 +28,24 @@ use serde::{Deserialize, Serialize}; pub struct LanguageTag(oxilangtag::LanguageTag); impl LanguageTag { + /// Parse a BCP-47 language tag from a string. + /// + /// Convenience over the `FromStr` impl when the input is + /// already a `&str` literal. + /// + /// # Errors + /// + /// Returns a validation error when `tag` is not a valid + /// BCP-47 tag. + pub fn new(tag: &str) -> Result { + tag.parse().map_err(|e| { + Error::validation( + format!("invalid BCP-47 language tag `{tag}`: {e}"), + "nvisy-core", + ) + }) + } + /// Returns the tag as a string slice. pub fn as_str(&self) -> &str { self.0.as_str() @@ -35,4 +55,59 @@ impl LanguageTag { pub fn primary_language(&self) -> &str { self.0.primary_language() } + + /// Return `true` when `self` and `other` share the same + /// primary language subtag. + /// + /// Compares only the primary subtag, so `"en"`, `"en-US"`, and + /// `"en-GB"` all match each other; `"en"` does not match + /// `"de"`. ASCII case-insensitive (BCP-47 tags are + /// case-insensitive by spec). 
+ /// + /// # Examples + /// + /// ``` + /// use nvisy_core::primitive::LanguageTag; + /// + /// let en = LanguageTag::new("en").unwrap(); + /// let en_us = LanguageTag::new("en-US").unwrap(); + /// let de = LanguageTag::new("de").unwrap(); + /// + /// assert!(en.matches(&en_us)); + /// assert!(en_us.matches(&en)); + /// assert!(!en.matches(&de)); + /// ``` + #[must_use] + pub fn matches(&self, other: &Self) -> bool { + self.primary_language() + .eq_ignore_ascii_case(other.primary_language()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn tag(s: &str) -> LanguageTag { + LanguageTag::new(s).expect("valid BCP-47 tag") + } + + #[test] + fn matches_same_primary_subtag() { + assert!(tag("en").matches(&tag("en-US"))); + assert!(tag("en-US").matches(&tag("en"))); + assert!(tag("en-US").matches(&tag("en-GB"))); + assert!(tag("en").matches(&tag("en"))); + } + + #[test] + fn matches_rejects_distinct_primary_subtags() { + assert!(!tag("en").matches(&tag("de"))); + assert!(!tag("en-US").matches(&tag("de-DE"))); + } + + #[test] + fn matches_is_case_insensitive() { + assert!(tag("EN").matches(&tag("en-us"))); + } } diff --git a/crates/nvisy-core/src/recognition/input.rs b/crates/nvisy-core/src/recognition/input.rs index 0594b5a4..0192878c 100644 --- a/crates/nvisy-core/src/recognition/input.rs +++ b/crates/nvisy-core/src/recognition/input.rs @@ -127,7 +127,9 @@ impl RecognizerInput { /// - An empty `allowed` list means the rule is language-agnostic /// and always runs. /// - When `allowed` is non-empty and [`language`] is `Some(_)`, - /// the rule runs only if the hint is in the list. + /// the rule runs when the hint shares a primary subtag with + /// any entry in `allowed` (so an `["en"]` rule fires for + /// `"en-US"` and `"en-GB"` hints). /// - When [`language`] is `None`, the rule still runs — we can't /// disprove applicability without a hint. 
/// @@ -138,7 +140,7 @@ impl RecognizerInput { return true; } match self.language.as_ref() { - Some(l) => allowed.iter().any(|a| a == l), + Some(hint) => allowed.iter().any(|a| a.matches(hint)), None => true, } } diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index d89d43fc..08fe0410 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -33,6 +33,7 @@ toml = { workspace = true, features = ["parse"] } # Derive macros and error handling derive_builder = { workspace = true, features = [] } +derive_more = { workspace = true, features = ["from"] } # Async runtime and parallelism async-trait = { workspace = true, features = [] } diff --git a/crates/nvisy-pattern/assets/patterns/contact/phone.toml b/crates/nvisy-pattern/assets/patterns/contact/phone.toml index 01df2224..ce5fe9dd 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/phone.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/phone.toml @@ -1,6 +1,11 @@ name = "phone" label = "phone_number" -context = ["phone", "call", "mobile", "tel", "fax", "contact"] + +[context] +en = ["phone", "call", "mobile", "tel", "fax", "contact"] +es = ["teléfono", "telefono", "llamar", "móvil", "movil", "celular", "tel", "fax", "contacto"] +de = ["telefon", "anruf", "mobil", "handy", "tel", "fax", "kontakt"] +fr = ["téléphone", "telephone", "appel", "mobile", "portable", "tel", "fax", "contact"] [[variants]] regex = "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b" diff --git a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml b/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml index 78b3325a..cfe4ab2b 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml @@ -1,6 +1,11 @@ name = "credit-card" label = "payment_card" -context = ["card", "credit", "debit", "payment", "visa", "mastercard", "amex"] + +[context] +en = 
["card", "credit", "debit", "payment", "visa", "mastercard", "amex"] +es = ["tarjeta", "crédito", "credito", "débito", "debito", "pago", "visa", "mastercard", "amex"] +de = ["karte", "kredit", "kreditkarte", "debit", "zahlung", "visa", "mastercard", "amex"] +fr = ["carte", "crédit", "credit", "débit", "debit", "paiement", "visa", "mastercard", "amex"] [[variants]] regex = "\\b(?:\\d[ \\-]*?){13,19}\\b" diff --git a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml b/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml index c88f21b9..a178c41d 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml +++ b/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml @@ -1,6 +1,11 @@ name = "date_of_birth" label = "date_of_birth" -context = ["birth", "born", "dob", "birthday"] + +[context] +en = ["birth", "born", "dob", "birthday", "date of birth"] +es = ["nacimiento", "nacido", "nacida", "fecha de nacimiento", "cumpleaños", "cumpleanos"] +de = ["geburt", "geboren", "geburtsdatum", "geburtstag"] +fr = ["naissance", "né", "nee", "née", "date de naissance", "anniversaire"] [[variants]] regex = "\\b(?:(?:0[1-9]|1[0-2]|[1-9])[/\\-](?:0[1-9]|[12]\\d|3[01]|[1-9])[/\\-](?:19|20)\\d{2}|(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01]))\\b" diff --git a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml b/crates/nvisy-pattern/assets/patterns/personal/datetime.toml index c1e00f7a..e37edf92 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml +++ b/crates/nvisy-pattern/assets/patterns/personal/datetime.toml @@ -1,6 +1,11 @@ name = "datetime" label = "date_time" -context = ["timestamp", "created", "modified", "logged", "at", "time"] + +[context] +en = ["timestamp", "created", "modified", "logged", "at", "time"] +es = ["marca de tiempo", "creado", "creada", "modificado", "modificada", "registrado", "a las", "hora", "fecha"] +de = ["zeitstempel", "erstellt", "geändert", "geandert", 
"protokolliert", "um", "uhrzeit", "zeit"] +fr = ["horodatage", "créé", "cree", "créée", "creee", "modifié", "modifie", "à", "heure", "date"] [[variants]] regex = "\\b(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[T ](?:[01]\\d|2[0-3]):[0-5]\\d(?::[0-5]\\d)?(?:Z|[+\\-]\\d{2}:?\\d{2})?\\b" diff --git a/crates/nvisy-pattern/src/lib.rs b/crates/nvisy-pattern/src/lib.rs index 129b002f..ca00f5f9 100644 --- a/crates/nvisy-pattern/src/lib.rs +++ b/crates/nvisy-pattern/src/lib.rs @@ -7,7 +7,7 @@ mod shipped; pub mod validators; pub use self::recognition::{ - Dictionary, DictionaryBuilder, PatternRecognizer, PatternRecognizerBuilder, Regex, + Context, Dictionary, DictionaryBuilder, PatternRecognizer, PatternRecognizerBuilder, Regex, RegexBuilder, Scoring, Term, Variant, }; pub use self::shipped::{dictionaries, patterns}; diff --git a/crates/nvisy-pattern/src/recognition/context.rs b/crates/nvisy-pattern/src/recognition/context.rs new file mode 100644 index 00000000..90f37504 --- /dev/null +++ b/crates/nvisy-pattern/src/recognition/context.rs @@ -0,0 +1,163 @@ +//! [`Context`]: per-rule keyword set used by the post-recognition +//! [`ContextEnhanced`] layer. +//! +//! Two shapes: +//! +//! - [`Global`] — one flat keyword list applied regardless of the +//! per-call language hint. +//! - [`PerLanguage`] — keyword lists keyed by [`LanguageTag`]; the +//! enhancer picks the entry matching `RecognizerInput.language`. +//! When no language hint is set, the union of every per-language +//! keyword fires (matches the crate's "missing language = any" +//! theme used by [`Regex::languages`] / [`Dictionary::languages`]). +//! +//! [`Global`]: Context::Global +//! [`PerLanguage`]: Context::PerLanguage +//! [`ContextEnhanced`]: nvisy_context::ContextEnhanced +//! [`Regex::languages`]: super::Regex::languages +//! 
[`Dictionary::languages`]: super::Dictionary::languages + +use std::collections::HashMap; +use std::collections::hash_map::Iter; + +use derive_more::From; +use nvisy_core::primitive::LanguageTag; +use serde::Deserialize; + +/// Per-rule context keyword set. +/// +/// Either a single flat list ([`Global`]) or a map keyed by +/// language ([`PerLanguage`]). +/// +/// [`Global`]: Self::Global +/// [`PerLanguage`]: Self::PerLanguage +#[derive(Debug, Clone, PartialEq, Eq, Deserialize, From)] +#[serde(untagged)] +pub enum Context { + /// One flat keyword list applied regardless of the per-call + /// language hint. + Global(Vec), + /// Per-language keyword lists. The enhancer picks the entry + /// matching `RecognizerInput.language`, or unions every list + /// when no hint is set. + PerLanguage(HashMap>), +} + +impl Context { + /// Return `true` when no keywords are declared in any scope. + #[must_use] + pub fn is_empty(&self) -> bool { + match self { + Self::Global(kws) => kws.is_empty(), + Self::PerLanguage(map) => map.values().all(Vec::is_empty), + } + } + + /// Iterate over `(language, keywords)` pairs. + /// + /// [`Global`] yields one entry with `language = None`; + /// [`PerLanguage`] yields one entry per language. + /// + /// [`Global`]: Self::Global + /// [`PerLanguage`]: Self::PerLanguage + pub fn iter(&self) -> ContextIter<'_> { + match self { + Self::Global(kws) => ContextIter::Global(Some(kws.as_slice())), + Self::PerLanguage(map) => ContextIter::PerLanguage(map.iter()), + } + } +} + +impl Default for Context { + fn default() -> Self { + Self::Global(Vec::new()) + } +} + +/// Iterator returned by [`Context::iter`]. 
+pub enum ContextIter<'a> { + Global(Option<&'a [String]>), + PerLanguage(Iter<'a, LanguageTag, Vec>), +} + +impl<'a> Iterator for ContextIter<'a> { + type Item = (Option<&'a LanguageTag>, &'a [String]); + + fn next(&mut self) -> Option { + match self { + Self::Global(slot) => slot.take().map(|kws| (None, kws)), + Self::PerLanguage(it) => it.next().map(|(lang, kws)| (Some(lang), kws.as_slice())), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Deserialize)] + struct Wrap { + context: Context, + } + + #[test] + fn parses_flat_array_as_global() { + let toml = r#"context = ["a", "b"]"#; + let w: Wrap = toml::from_str(toml).unwrap(); + assert_eq!(w.context, Context::Global(vec!["a".into(), "b".into()])); + } + + #[test] + fn parses_table_as_per_language() { + let toml = r#" + [context] + en = ["card"] + es = ["tarjeta"] + "#; + let w: Wrap = toml::from_str(toml).unwrap(); + let map = match w.context { + Context::PerLanguage(m) => m, + _ => panic!("expected PerLanguage"), + }; + assert_eq!(map.len(), 2); + assert_eq!( + map.get(&LanguageTag::new("en").unwrap()).unwrap(), + &vec!["card".to_owned()] + ); + assert_eq!( + map.get(&LanguageTag::new("es").unwrap()).unwrap(), + &vec!["tarjeta".to_owned()] + ); + } + + #[test] + fn iter_global_yields_one_none_entry() { + let ctx = Context::Global(vec!["a".into(), "b".into()]); + let collected: Vec<_> = ctx + .iter() + .map(|(lang, kws)| (lang.cloned(), kws.to_vec())) + .collect(); + assert_eq!(collected.len(), 1); + assert!(collected[0].0.is_none()); + assert_eq!(collected[0].1, vec!["a".to_owned(), "b".to_owned()]); + } + + #[test] + fn iter_per_language_yields_one_entry_per_language() { + let mut map = HashMap::new(); + map.insert(LanguageTag::new("en").unwrap(), vec!["card".into()]); + map.insert(LanguageTag::new("es").unwrap(), vec!["tarjeta".into()]); + let ctx = Context::PerLanguage(map); + let collected: Vec<_> = ctx + .iter() + .map(|(lang, kws)| (lang.unwrap().to_string(), kws.to_vec())) + 
.collect(); + assert_eq!(collected.len(), 2); + } + + #[test] + fn default_is_empty_global() { + let ctx = Context::default(); + assert!(ctx.is_empty()); + } +} diff --git a/crates/nvisy-pattern/src/recognition/dictionary.rs b/crates/nvisy-pattern/src/recognition/dictionary.rs index 3285f7af..3be046c6 100644 --- a/crates/nvisy-pattern/src/recognition/dictionary.rs +++ b/crates/nvisy-pattern/src/recognition/dictionary.rs @@ -6,6 +6,7 @@ use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use serde::Deserialize; +use super::context::Context; use super::term::Term; /// Confidence policy for a [`Dictionary`]'s matches. @@ -129,10 +130,11 @@ pub struct Dictionary { #[serde(default, rename = "score")] pub scoring: Scoring, /// Context keywords that lift confidence when one of them - /// appears near a match. + /// appears near a match. Either a flat list applied + /// regardless of language, or a per-language map. #[builder(default)] #[serde(default)] - pub context: Vec, + pub context: Context, /// BCP-47 language tags the dictionary applies to. Empty means /// "any language"; otherwise the recognizer skips the /// dictionary when the per-call language hint is not in the @@ -222,7 +224,7 @@ struct DictionaryMetadata { #[serde(default)] score: Option, #[serde(default)] - context: Option>, + context: Option, #[serde(default)] word_boundary: Option, } diff --git a/crates/nvisy-pattern/src/recognition/mod.rs b/crates/nvisy-pattern/src/recognition/mod.rs index e9be1ed0..7dff66a5 100644 --- a/crates/nvisy-pattern/src/recognition/mod.rs +++ b/crates/nvisy-pattern/src/recognition/mod.rs @@ -9,11 +9,13 @@ //! on matches near a declared keyword. 
mod compiled; +mod context; mod dictionary; mod recognizer; mod regex; mod term; +pub use self::context::Context; pub use self::dictionary::{Dictionary, DictionaryBuilder, Scoring}; pub use self::recognizer::{PatternRecognizer, PatternRecognizerBuilder}; pub use self::regex::{Regex, RegexBuilder, Variant}; diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index f4ad3db9..3423e80b 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -4,6 +4,7 @@ use aho_corasick::{AhoCorasick, MatchKind}; use nvisy_context::{BoostRule, ContextEnhanced, Enhancer, SubstringMatcher}; use nvisy_core::entity::{Entity, EntityLabelCatalog, EntityLabelRef}; use nvisy_core::modality::Text; +use nvisy_core::primitive::LanguageTag; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput}; use nvisy_core::{Error, Result}; use regex::RegexSet; @@ -365,27 +366,53 @@ impl PatternRecognizerBuilder { /// Build the wrapping [`Enhancer`] from per-pattern and /// per-dictionary context keywords. + /// + /// Per-rule [`Context`] produces one [`BoostRule`] per + /// language scope (global rules carry + /// `language = None`; per-language rules carry the language + /// tag). The enhancer keys these by label and filters them + /// against the per-call language hint at apply time. 
+ /// + /// [`Context`]: super::Context fn build_enhancer(&self) -> Enhancer { let boost_rules: Vec = self .context_keywords() - .map(|(label, keywords)| BoostRule::for_label(label.clone(), keywords.iter().cloned())) + .map(|(label, language, keywords)| { + let rule = BoostRule::for_label(label.clone(), keywords.iter().cloned()); + match language { + Some(lang) => rule.with_language(lang.clone()), + None => rule, + } + }) .collect(); Enhancer::new(boost_rules, Box::new(SubstringMatcher)) } - /// Yield `(label, keywords)` for every pattern and dictionary - /// that declares a non-empty context. - fn context_keywords(&self) -> impl Iterator { + /// Yield `(label, language, keywords)` for every pattern and + /// dictionary that declares a non-empty context. Global + /// keywords carry `language = None`; per-language keywords + /// carry `Some(tag)`. + fn context_keywords( + &self, + ) -> impl Iterator, &[String])> { let pattern_keywords = self .patterns .iter() .filter(|p| !p.context.is_empty()) - .map(|p| (&p.label, p.context.as_slice())); + .flat_map(|p| { + p.context + .iter() + .map(move |(lang, kws)| (&p.label, lang, kws)) + }); let dict_keywords = self .dictionaries .iter() .filter(|d| !d.context.is_empty()) - .map(|d| (&d.label, d.context.as_slice())); + .flat_map(|d| { + d.context + .iter() + .map(move |(lang, kws)| (&d.label, lang, kws)) + }); pattern_keywords.chain(dict_keywords) } } @@ -436,8 +463,11 @@ impl EntityRecognizer for PatternRecognizer { #[cfg(test)] mod tests { + use std::collections::HashMap; + use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; use nvisy_core::modality::{Text, TextData}; + use nvisy_core::primitive::Confidence; use nvisy_core::recognition::RecognizerInput; use super::*; @@ -492,4 +522,182 @@ mod tests { let entities = run(&recognizer, "example").await; assert_eq!(entities.len(), 1, "substring match must be kept"); } + + #[test] + fn regex_parses_flat_context_as_global() { + let toml = r#" + name = "x" + label = 
"government_id" + context = ["ssn", "social security"] + [[variants]] + regex = "\\d+" + "#; + let regex = crate::Regex::from_toml(toml).expect("flat-context TOML parses"); + assert!(matches!(regex.context, crate::Context::Global(_))); + } + + #[test] + fn regex_parses_table_context_as_per_language() { + let toml = r#" + name = "x" + label = "payment_card" + [context] + en = ["card", "credit"] + es = ["tarjeta", "crédito"] + [[variants]] + regex = "\\d+" + "#; + let regex = crate::Regex::from_toml(toml).expect("table-context TOML parses"); + let map = match regex.context { + crate::Context::PerLanguage(m) => m, + _ => panic!("expected PerLanguage"), + }; + assert_eq!(map.len(), 2); + } + + async fn run_with_language( + recognizer: &impl EntityRecognizer, + text: &str, + language: Option<&str>, + ) -> Vec> { + let mut input = RecognizerInput::new(TextData::new(text.to_owned())); + if let Some(lang) = language { + input = input.with_language(LanguageTag::new(lang).expect("language tag parses")); + } + recognizer + .recognize(&input) + .await + .expect("recognize succeeds") + .entities + } + + fn per_language_credit_card_regex() -> crate::Regex { + let variant = crate::Variant::new(r"\b\d{16}\b") + .expect("variant builds") + .with_score(Confidence::clamped(0.5)); + let mut context = HashMap::new(); + context.insert( + LanguageTag::new("en").unwrap(), + vec!["credit".to_owned(), "card".to_owned()], + ); + context.insert( + LanguageTag::new("es").unwrap(), + vec!["tarjeta".to_owned(), "crédito".to_owned()], + ); + crate::Regex::builder() + .with_name("credit_card") + .with_label(builtins::PAYMENT_CARD.label_ref()) + .with_context(crate::Context::PerLanguage(context)) + .with_variants(vec![variant]) + .build() + .expect("regex builds") + } + + #[tokio::test] + async fn per_language_boost_fires_for_matching_language() { + let recognizer = PatternRecognizer::builder() + .with_pattern(per_language_credit_card_regex()) + .build_context_enhanced() + .expect("recognizer 
builds"); + + let text = "Pay with your credit card 4111111111111111 today"; + let entities = run_with_language(&recognizer, text, Some("en")).await; + let card = entities + .iter() + .find(|e| &text[e.location.start..e.location.end] == "4111111111111111") + .expect("card match present"); + assert!( + card.confidence.get() > 0.5, + "English keyword `credit` should boost under en hint", + ); + } + + #[tokio::test] + async fn per_language_boost_fires_for_regional_variant() { + // Pattern is scoped `en`; hint is `en-US`. Primary subtag + // matches, so the boost must fire. + let recognizer = PatternRecognizer::builder() + .with_pattern(per_language_credit_card_regex()) + .build_context_enhanced() + .expect("recognizer builds"); + + let text = "Pay with your credit card 4111111111111111 today"; + let entities = run_with_language(&recognizer, text, Some("en-US")).await; + let card = entities + .iter() + .find(|e| &text[e.location.start..e.location.end] == "4111111111111111") + .expect("card match present"); + assert!( + card.confidence.get() > 0.5, + "`en-US` hint should fire the `en`-scoped boost", + ); + } + + #[tokio::test] + async fn rule_language_filter_accepts_regional_variant() { + // Pattern is scoped `languages = ["en"]`; the per-call + // hint is `en-US`. The rule must still run. 
+ let variant = crate::Variant::new(r"\b\d{3}-\d{2}-\d{4}\b") + .expect("variant builds") + .with_score(Confidence::clamped(0.5)); + let regex = crate::Regex::builder() + .with_name("ssn") + .with_label(builtins::GOVERNMENT_ID.label_ref()) + .with_variants(vec![variant]) + .with_languages(vec![LanguageTag::new("en").unwrap()]) + .build() + .expect("regex builds"); + + let recognizer = PatternRecognizer::builder() + .with_pattern(regex) + .build() + .expect("recognizer builds"); + + let entities = run_with_language(&recognizer, "SSN: 123-45-6789", Some("en-US")).await; + assert_eq!( + entities.len(), + 1, + "`en`-scoped rule must run for `en-US` input", + ); + } + + #[tokio::test] + async fn per_language_boost_skipped_for_non_matching_language() { + let recognizer = PatternRecognizer::builder() + .with_pattern(per_language_credit_card_regex()) + .build_context_enhanced() + .expect("recognizer builds"); + + // English keywords near the match, but caller asserted Spanish. + let text = "Pay with your credit card 4111111111111111 today"; + let entities = run_with_language(&recognizer, text, Some("es")).await; + let card = entities + .iter() + .find(|e| &text[e.location.start..e.location.end] == "4111111111111111") + .expect("card match present"); + assert!( + (card.confidence.get() - 0.5).abs() < f64::EPSILON, + "English keywords must not boost under es hint", + ); + } + + #[tokio::test] + async fn no_language_hint_unions_per_language_keywords() { + let recognizer = PatternRecognizer::builder() + .with_pattern(per_language_credit_card_regex()) + .build_context_enhanced() + .expect("recognizer builds"); + + // English keyword near the match, no language hint set. 
+ let text = "Pay with your credit card 4111111111111111 today"; + let entities = run_with_language(&recognizer, text, None).await; + let card = entities + .iter() + .find(|e| &text[e.location.start..e.location.end] == "4111111111111111") + .expect("card match present"); + assert!( + card.confidence.get() > 0.5, + "missing language hint should permit any per-language keyword to boost", + ); + } } diff --git a/crates/nvisy-pattern/src/recognition/regex.rs b/crates/nvisy-pattern/src/recognition/regex.rs index f084f4a0..6c926ff4 100644 --- a/crates/nvisy-pattern/src/recognition/regex.rs +++ b/crates/nvisy-pattern/src/recognition/regex.rs @@ -6,6 +6,8 @@ use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use serde::Deserialize; +use super::context::Context; + /// One regex strategy inside a [`Regex`] rule. /// /// A variant pairs a regex source with the confidence stamped on @@ -133,10 +135,11 @@ pub struct Regex { /// Entity label every variant emits. pub label: EntityLabelRef, /// Context keywords that lift confidence when one of them - /// appears near a match. + /// appears near a match. Either a flat list applied + /// regardless of language, or a per-language map. #[builder(default)] #[serde(default)] - pub context: Vec, + pub context: Context, /// Regex variants. At least one is required to produce matches; /// the recognizer skips rules with an empty variant list. pub variants: Vec, From 40d49750fbbc2d4bd776f64139319c03707670ba Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Mon, 15 Jun 2026 06:51:38 +0200 Subject: [PATCH 08/14] feat(toolkit): SuppressionLayer with allow-list false-positive filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New `nvisy-toolkit::deduplication::suppress::{SuppressionLayer, SuppressionParams}`. 
Three independent allow-list shapes apply by union: - `allow_values` — exact, ASCII case-insensitive - `allow_values_substring` — entity text contains the value - `allow_values_regex` — regex matched against entity text - Operates on the entity's resolved text via `TextAt::text_at`. Fail-open when the resolver returns `None`: keep the entity rather than silently drop something we can't verify. - Empty entries are filtered at construction (otherwise an empty substring would drop every entity via `str::contains("")`). - `LayerParams` gains a nested `suppression: SuppressionParams` field; `LayerPipeline::from_params` becomes fallible and inserts the new layer between fuse and resolve, growing the canonical recipe to: calibrate → filter → fuse → suppress → resolve. - Six `from_params` call sites updated (engine pipeline, engine tests, toolkit fixtures, toolkit example). - 15 unit tests in `suppress::tests` (exact / substring / regex modes, case insensitivity, partial-overlap semantics, empty- entry filtering, union across modes, unresolved-location fail-open, invalid-regex error). Plus a pipeline-order test in `pipeline::tests` that pins the architectural intent that fuse collapses before suppress sees, suppress drops before resolve adjudicates. 
Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 1 + .../src/detection/phases/deduplication.rs | 2 +- crates/nvisy-engine/tests/deduplication.rs | 6 +- crates/nvisy-toolkit/Cargo.toml | 1 + crates/nvisy-toolkit/examples/pipeline.rs | 2 +- crates/nvisy-toolkit/src/deduplication/mod.rs | 5 +- .../nvisy-toolkit/src/deduplication/params.rs | 17 +- .../src/deduplication/pipeline.rs | 93 ++++- .../src/deduplication/suppress/mod.rs | 366 ++++++++++++++++++ .../src/deduplication/suppress/params.rs | 82 ++++ .../nvisy-toolkit/tests/fixtures/pipeline.rs | 2 + 11 files changed, 562 insertions(+), 15 deletions(-) create mode 100644 crates/nvisy-toolkit/src/deduplication/suppress/mod.rs create mode 100644 crates/nvisy-toolkit/src/deduplication/suppress/params.rs diff --git a/Cargo.lock b/Cargo.lock index 5028cb9d..d2403490 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3091,6 +3091,7 @@ dependencies = [ "nvisy-ocr", "nvisy-pattern", "nvisy-stt", + "regex", "schemars", "serde", "serde_json", diff --git a/crates/nvisy-engine/src/detection/phases/deduplication.rs b/crates/nvisy-engine/src/detection/phases/deduplication.rs index 8c3dd742..1a4019f4 100644 --- a/crates/nvisy-engine/src/detection/phases/deduplication.rs +++ b/crates/nvisy-engine/src/detection/phases/deduplication.rs @@ -120,7 +120,7 @@ where // rewrap without losing audit state. 
let records = mem::take(&mut tree.root.audit.records); let entities: Vec> = records.into_iter().map(|r| r.entity).collect(); - let pipeline: LayerPipeline = LayerPipeline::from_params(dedup); + let pipeline: LayerPipeline = LayerPipeline::from_params(dedup)?; let ctx = LayerContext::new(&*tree).with_correlation_id(run_id); let deduped = pipeline.run(entities, &ctx).await; tree.root.audit.records = deduped.into_iter().map(EntityRecord::new).collect(); diff --git a/crates/nvisy-engine/tests/deduplication.rs b/crates/nvisy-engine/tests/deduplication.rs index 49b497ce..b041f35a 100644 --- a/crates/nvisy-engine/tests/deduplication.rs +++ b/crates/nvisy-engine/tests/deduplication.rs @@ -83,7 +83,8 @@ async fn confidence_threshold_filters() { .with_confidence(conf(0.5)) .test_build(), ]; - let pipeline: LayerPipeline = LayerPipeline::from_params(&params); + let pipeline: LayerPipeline = + LayerPipeline::from_params(&params).expect("pipeline builds"); let ctx = LayerContext::new(&tree).with_correlation_id(Uuid::nil()); let result = pipeline.run(entities, &ctx).await; assert_eq!(result.len(), 1); @@ -111,7 +112,8 @@ async fn full_pipeline() { .with_confidence(conf(0.85)) .test_build(), ]; - let pipeline: LayerPipeline = LayerPipeline::from_params(&LayerParams::default()); + let pipeline: LayerPipeline = + LayerPipeline::from_params(&LayerParams::default()).expect("pipeline builds"); let ctx = LayerContext::new(&tree).with_correlation_id(Uuid::nil()); let result = pipeline.run(entities, &ctx).await; assert_eq!(result.len(), 1); diff --git a/crates/nvisy-toolkit/Cargo.toml b/crates/nvisy-toolkit/Cargo.toml index 9b2a3e63..f16707d0 100644 --- a/crates/nvisy-toolkit/Cargo.toml +++ b/crates/nvisy-toolkit/Cargo.toml @@ -76,6 +76,7 @@ tracing = { workspace = true, features = [] } # Text processing (unicode-aware folding for leak detection) unicode-normalization = { workspace = true, features = [] } +regex = { workspace = true, features = [] } [dev-dependencies] # Internal test utilities
(Entity::test_builder, …). diff --git a/crates/nvisy-toolkit/examples/pipeline.rs b/crates/nvisy-toolkit/examples/pipeline.rs index 01554730..66c6c908 100644 --- a/crates/nvisy-toolkit/examples/pipeline.rs +++ b/crates/nvisy-toolkit/examples/pipeline.rs @@ -93,7 +93,7 @@ async fn main() -> Result<()> { ..LayerParams::default() }; let ctx = LayerContext::>::new(&source); - let dedup = LayerPipeline::>::from_params(&params); + let dedup = LayerPipeline::>::from_params(&params)?; let before = entities.len(); let entities = dedup.run(entities, &ctx).await; diff --git a/crates/nvisy-toolkit/src/deduplication/mod.rs b/crates/nvisy-toolkit/src/deduplication/mod.rs index 94852ada..c5cb05ab 100644 --- a/crates/nvisy-toolkit/src/deduplication/mod.rs +++ b/crates/nvisy-toolkit/src/deduplication/mod.rs @@ -30,7 +30,9 @@ //! 1. **Calibrate** raw confidence scores per-recognizer. //! 2. **Filter** by allowed kinds + confidence floor. //! 3. **Fuse** co-referent entities into one (group + combine). -//! 4. **Resolve conflicts** between different kinds on the same span. +//! 4. **Suppress** entities whose matched text is on a +//! caller-supplied allow list. +//! 5. **Resolve conflicts** between different kinds on the same span. //! //! Operators can swap steps, drop steps, or insert their own custom //! [`Layer`] impls by building the pipeline manually with @@ -40,6 +42,7 @@ pub mod calibrate; pub mod filter; pub mod fuse; pub mod resolve; +pub mod suppress; mod layer; mod params; diff --git a/crates/nvisy-toolkit/src/deduplication/params.rs b/crates/nvisy-toolkit/src/deduplication/params.rs index cc8ca7f8..8f73b6ae 100644 --- a/crates/nvisy-toolkit/src/deduplication/params.rs +++ b/crates/nvisy-toolkit/src/deduplication/params.rs @@ -1,17 +1,18 @@ //! [`LayerParams`]: the per-call knob bag that drives the //! canonical deduplication recipe. //! -//! Bundles every per-layer setting the four-step recipe needs +//! Bundles every per-layer setting the five-step recipe needs //!
([`CalibrationMap`], filtering thresholds + allowed kinds, //! [`DeduplicationStrategy`], [`GroupingCriteria`], -//! [`ConflictResolution`]) into a single deserialisable shape -//! callers set once per request. +//! [`SuppressionParams`], [`ConflictResolution`]) into a single +//! deserialisable shape callers set once per request. //! [`LayerPipeline::from_params`] reads it and assembles the -//! four-step pipeline. +//! five-step pipeline. //! //! [`CalibrationMap`]: super::calibrate::CalibrationMap //! [`DeduplicationStrategy`]: super::fuse::DeduplicationStrategy //! [`GroupingCriteria`]: super::fuse::GroupingCriteria +//! [`SuppressionParams`]: super::suppress::SuppressionParams //! [`ConflictResolution`]: super::resolve::ConflictResolution //! [`LayerPipeline::from_params`]: super::pipeline::LayerPipeline::from_params @@ -23,8 +24,9 @@ use serde::{Deserialize, Serialize}; use super::calibrate::CalibrationMap; use super::fuse::{DeduplicationStrategy, GroupingCriteria}; use super::resolve::ConflictResolution; +use super::suppress::SuppressionParams; -/// Configuration for the deduplication pipeline's four-step recipe. +/// Configuration for the deduplication pipeline's five-step recipe. /// /// Owns the sole confidence threshold in the pipeline: detection /// layers and recognizers do not filter on confidence themselves — @@ -56,6 +58,11 @@ pub struct LayerParams { /// group. #[serde(default)] pub strategy: DeduplicationStrategy, + /// Allow-list inputs consumed by [`SuppressionLayer`]. + /// + /// [`SuppressionLayer`]: super::suppress::SuppressionLayer + #[serde(default, skip_serializing_if = "SuppressionParams::is_empty")] + pub suppression: SuppressionParams, /// How to resolve conflicts when different entity kinds overlap /// the same span. 
#[serde(default)] diff --git a/crates/nvisy-toolkit/src/deduplication/pipeline.rs b/crates/nvisy-toolkit/src/deduplication/pipeline.rs index b431f9d3..35bc7ea5 100644 --- a/crates/nvisy-toolkit/src/deduplication/pipeline.rs +++ b/crates/nvisy-toolkit/src/deduplication/pipeline.rs @@ -7,6 +7,7 @@ use std::marker::PhantomData; +use nvisy_core::Error; use nvisy_core::entity::Entity; use nvisy_core::extraction::TextAt; use nvisy_core::modality::{Modality, Overlap}; @@ -18,6 +19,7 @@ use super::layer::{Layer, LayerContext}; use super::params::LayerParams; use super::resolve::ResolveConflictsLayer; use super::span_size::SpanSize; +use super::suppress::SuppressionLayer; const TARGET: &str = "nvisy_toolkit::deduplication"; @@ -92,17 +94,25 @@ where M::Location: Overlap + SpanSize, R: TextAt + ?Sized, { - /// Build the canonical four-layer recipe: calibrate → filter → - /// fuse → resolve. Every layer's config is read from `params`. - pub fn from_params(params: &LayerParams) -> Self { + /// Build the canonical five-layer recipe: calibrate → filter → + /// fuse → suppress → resolve. Every layer's config is read + /// from `params`. + /// + /// # Errors + /// + /// Returns a validation error when any + /// `params.suppression.allow_values_regex` entry fails to compile.
+ pub fn from_params(params: &LayerParams) -> Result { let filter = FilterLayer::new() .with_allowed_labels(params.allowed_labels.clone()) .with_confidence_threshold(params.confidence_threshold); - Self::new() + let suppress = SuppressionLayer::from_params(&params.suppression)?; + Ok(Self::new() .with_layer(CalibrateLayer::new(params.calibration.clone())) .with_layer(filter) .with_layer(FuseLayer::new(params.strategy.clone(), params.grouping)) - .with_layer(ResolveConflictsLayer::new(params.conflict_resolution)) + .with_layer(suppress) + .with_layer(ResolveConflictsLayer::new(params.conflict_resolution))) } } @@ -111,3 +121,76 @@ impl + ?Sized> Default for LayerPipeline { Self::new() } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use async_trait::async_trait; + use nvisy_core::entity::{Entity, builtins}; + use nvisy_core::modality::{Text, TextLocation}; + + use super::*; + use crate::deduplication::suppress::SuppressionParams; + + struct TextSliceResolver(Arc); + + #[async_trait] + impl TextAt for TextSliceResolver { + async fn text_at(&self, location: &TextLocation) -> Option { + self.0.get(location.start..location.end).map(String::from) + } + } + + fn email(start: usize, end: usize) -> Entity { + Entity::test_builder(start, end) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) + .test_build() + } + + fn url(start: usize, end: usize) -> Entity { + Entity::test_builder(start, end) + .with_label(builtins::URL.label_ref()) + .test_build() + } + + /// Pipeline-order contract: fuse collapses same-kind duplicates + /// before suppress sees them; suppress drops allowlisted + /// entities before resolve adjudicates cross-kind conflicts. + /// + /// Inputs: two EMAIL_ADDRESS hits at the same span (duplicates + /// of an allowlisted email-like value) plus one URL hit at an + /// overlapping span. After the pipeline, only the URL should + /// survive.
+ /// + /// With `fuse → suppress` ordering, suppress evaluates one + /// fused entity instead of each duplicate separately. Without + /// `suppress → resolve` ordering, the resolve step would + /// pick a winner between EMAIL and URL — possibly the EMAIL — + /// before the allow-list could remove it. + #[tokio::test] + async fn fuse_then_suppress_then_resolve() { + let source = "noreply@foo.com /docs"; + let resolver = TextSliceResolver(Arc::new(source.to_owned())); + + let params = LayerParams { + suppression: SuppressionParams::new() + .with_allow_values(vec!["noreply@foo.com".to_owned()]), + ..Default::default() + }; + + let pipeline: LayerPipeline = + LayerPipeline::from_params(&params).expect("pipeline builds"); + let ctx = LayerContext::new(&resolver); + + // Two EMAIL hits at [0, 15) — same kind, same span, fuse + // collapses to one. The collapsed entity matches the + // allow-list; suppress drops it. A URL at [0, 21) remains + // for resolve to leave untouched. + let entities = vec![email(0, 15), email(0, 15), url(0, 21)]; + + let survivors = pipeline.run(entities, &ctx).await; + assert_eq!(survivors.len(), 1, "only the URL should survive"); + assert_eq!(survivors[0].label, builtins::URL.label_ref()); + } +} diff --git a/crates/nvisy-toolkit/src/deduplication/suppress/mod.rs b/crates/nvisy-toolkit/src/deduplication/suppress/mod.rs new file mode 100644 index 00000000..71b46a26 --- /dev/null +++ b/crates/nvisy-toolkit/src/deduplication/suppress/mod.rs @@ -0,0 +1,366 @@ +//! [`SuppressionLayer`]: drop entities whose matched text is on a +//! caller-supplied allow list. +//! +//! See [`SuppressionParams`] for the three allow-list shapes. +//! +//! All three operate on the **entity's resolved text** (sliced from +//! the source via [`TextAt::text_at`]), not the surrounding +//! document. When the resolver returns `None` (e.g. malformed +//! location), the entity is kept — better to surface a false +//! positive than silently drop something we can't verify. +//! +//!
Returns dropped entities from [`Layer::apply`] so the pipeline +//! can attribute them in its drop-reason roll-up. +//! +//! [`Layer::apply`]: super::layer::Layer::apply +//! [`TextAt::text_at`]: nvisy_core::extraction::TextAt::text_at + +mod params; + +use nvisy_core::Error; +use nvisy_core::entity::Entity; +use nvisy_core::extraction::TextAt; +use nvisy_core::modality::Modality; +use regex::Regex; + +pub use self::params::SuppressionParams; +use super::layer::{Layer, LayerContext}; + +const TARGET: &str = "nvisy_toolkit::deduplication::suppress"; + +/// [`Layer`] that drops entities whose resolved text is on a +/// caller-supplied allow list. +/// +/// Construct via [`SuppressionLayer::new`] (empty, fast no-op) or +/// [`SuppressionLayer::from_params`] (pre-validates the regex +/// inputs). An empty layer short-circuits in +/// [`Layer::apply`] without touching the resolver. +#[derive(Debug, Clone, Default)] +pub struct SuppressionLayer { + /// Pre-lowercased exact-match values. + allow_values: Vec, + /// Pre-lowercased substring values. + allow_values_substring: Vec, + /// Pre-compiled regex patterns. + allow_values_regex: Vec, +} + +impl SuppressionLayer { + /// Empty layer: passes every entity through unchanged. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Construct from a [`SuppressionParams`]. Each regex source + /// is compiled once here. + /// + /// Empty strings are silently dropped at construction from + /// all three lists: each would match every entity (or every + /// position) and is virtually never what the author meant. + /// Treating them as configuration mistakes and ignoring them + /// is safer than wiping every result. + /// + /// # Errors + /// + /// Returns a validation error when any non-empty entry in + /// [`SuppressionParams::allow_values_regex`] is not a valid + /// regular expression. 
+ pub fn from_params(params: &SuppressionParams) -> Result { + let allow_values = params + .allow_values + .iter() + .filter(|v| !v.is_empty()) + .map(|v| v.to_ascii_lowercase()) + .collect(); + let allow_values_substring = params + .allow_values_substring + .iter() + .filter(|v| !v.is_empty()) + .map(|v| v.to_ascii_lowercase()) + .collect(); + let allow_values_regex = params + .allow_values_regex + .iter() + .filter(|src| !src.is_empty()) + .map(|src| { + Regex::new(src).map_err(|e| { + Error::validation( + format!("invalid allow_values_regex `{src}`: {e}"), + "nvisy-toolkit", + ) + }) + }) + .collect::, _>>()?; + Ok(Self { + allow_values, + allow_values_substring, + allow_values_regex, + }) + } + + /// Return `true` when no allow-list values are configured. + #[must_use] + pub fn is_empty(&self) -> bool { + self.allow_values.is_empty() + && self.allow_values_substring.is_empty() + && self.allow_values_regex.is_empty() + } + + /// Return `true` when `text` matches any configured allow-list + /// entry under exact / substring / regex semantics. 
+ #[must_use] + pub fn suppresses(&self, text: &str) -> bool { + let lowered = text.to_ascii_lowercase(); + if self.allow_values.iter().any(|v| v == &lowered) { + return true; + } + if self + .allow_values_substring + .iter() + .any(|v| lowered.contains(v.as_str())) + { + return true; + } + if self.allow_values_regex.iter().any(|r| r.is_match(text)) { + return true; + } + false + } +} + +#[async_trait::async_trait] +impl Layer for SuppressionLayer +where + M: Modality, + R: TextAt + ?Sized, +{ + async fn apply( + &self, + entities: &mut Vec>, + ctx: &LayerContext<'_, M, R>, + ) -> Vec> { + if self.is_empty() || entities.is_empty() { + return Vec::new(); + } + + let mut suppressed_flags = Vec::with_capacity(entities.len()); + for entity in entities.iter() { + let suppress = match ctx.resolver.text_at(&entity.location).await { + Some(text) => self.suppresses(&text), + None => false, + }; + suppressed_flags.push(suppress); + } + + let mut suppressed_count = 0usize; + let mut dropped = Vec::new(); + let mut idx = 0usize; + entities.retain(|entity| { + let drop = suppressed_flags[idx]; + idx += 1; + if drop { + suppressed_count += 1; + dropped.push(entity.clone()); + } + !drop + }); + + if suppressed_count > 0 { + tracing::debug!( + target: TARGET, + suppressed = suppressed_count, + "entities suppressed by allow list", + ); + } + dropped + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use nvisy_core::entity::{Entity, builtins}; + use nvisy_core::modality::{Text, TextLocation}; + + use super::*; + + /// Test resolver that resolves locations to a slice of a known + /// string. The Noop test_resolver in the parent module returns + /// `None`, which is fine for layers that don't touch text but + /// useless here. 
+ struct TextSliceResolver { + text: Arc, + } + + #[async_trait::async_trait] + impl TextAt for TextSliceResolver { + async fn text_at(&self, location: &TextLocation) -> Option { + self.text + .get(location.start..location.end) + .map(String::from) + } + } + + fn entity(start: usize, end: usize) -> Entity { + Entity::test_builder(start, end) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) + .test_build() + } + + fn params(values: &[&str], substrings: &[&str], regexes: &[&str]) -> SuppressionParams { + SuppressionParams { + allow_values: values.iter().map(|s| (*s).to_owned()).collect(), + allow_values_substring: substrings.iter().map(|s| (*s).to_owned()).collect(), + allow_values_regex: regexes.iter().map(|s| (*s).to_owned()).collect(), + } + } + + async fn apply_to( + layer: &SuppressionLayer, + source: &str, + mut entities: Vec>, + ) -> (Vec>, Vec>) { + let resolver = TextSliceResolver { + text: Arc::new(source.to_owned()), + }; + let ctx = LayerContext::new(&resolver); + let dropped = layer.apply(&mut entities, &ctx).await; + (entities, dropped) + } + + #[tokio::test] + async fn empty_layer_is_noop() { + let layer = SuppressionLayer::new(); + let source = "noreply@foo.com matters"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, 15)]).await; + assert_eq!(kept.len(), 1); + assert!(dropped.is_empty()); + } + + #[tokio::test] + async fn exact_match_drops_entity() { + let layer = SuppressionLayer::from_params(&params(&["noreply@foo.com"], &[], &[])) + .expect("layer builds"); + let source = "noreply@foo.com matters"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, 15)]).await; + assert!(kept.is_empty()); + assert_eq!(dropped.len(), 1); + } + + #[tokio::test] + async fn exact_match_is_case_insensitive() { + let layer = SuppressionLayer::from_params(&params(&["NoReply@Foo.com"], &[], &[])) + .expect("layer builds"); + let source = "noreply@foo.com matters"; + let (kept, _) = apply_to(&layer, source, vec![entity(0, 15)]).await; +
assert!(kept.is_empty(), "case-insensitive allow-list should drop"); + } + + #[tokio::test] + async fn exact_match_does_not_drop_partial_overlap() { + // Allow value is a substring of the entity, but not an + // exact equal — exact mode keeps it. + let layer = SuppressionLayer::from_params(&params(&["noreply@foo.com"], &[], &[])) + .expect("layer builds"); + let source = "noreply@foo.com support team"; + let (kept, _) = apply_to(&layer, source, vec![entity(0, 28)]).await; + assert_eq!(kept.len(), 1, "exact mode must not drop on partial overlap"); + } + + #[tokio::test] + async fn substring_match_drops_partial_overlap() { + let layer = SuppressionLayer::from_params(&params(&[], &["noreply@foo.com"], &[])) + .expect("layer builds"); + let source = "noreply@foo.com support team"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, 28)]).await; + assert!(kept.is_empty()); + assert_eq!(dropped.len(), 1); + } + + #[tokio::test] + async fn regex_match_drops_entity() { + let layer = SuppressionLayer::from_params(&params(&[], &[], &[r"^test-.*@foo\.com$"])) + .expect("layer builds"); + let source = "test-1234@foo.com"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, source.len())]).await; + assert!(kept.is_empty()); + assert_eq!(dropped.len(), 1); + } + + #[tokio::test] + async fn invalid_regex_at_construction_errors() { + let result = SuppressionLayer::from_params(&params(&[], &[], &["["])); + assert!(result.is_err(), "invalid regex must error at construction"); + } + + #[tokio::test] + async fn unresolved_text_keeps_entity() { + // Pass an entity with a location outside the source text. + // text_at returns None, the layer falls open and keeps the + // entity rather than silently dropping it.
+ let layer = SuppressionLayer::from_params(&params(&["noreply@foo.com"], &[], &[])) + .expect("layer builds"); + let source = "short"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(100, 200)]).await; + assert_eq!(kept.len(), 1); + assert!(dropped.is_empty()); + } + + #[tokio::test] + async fn empty_substring_entry_does_not_suppress_everything() { + // `str::contains("")` is always true; without the + // construction-time filter, an empty entry would wipe + // every match. Confirm the filter holds. + let layer = SuppressionLayer::from_params(&params(&[], &[""], &[])).expect("layer builds"); + let source = "noreply@foo.com matters"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, 15)]).await; + assert_eq!(kept.len(), 1, "empty substring must not drop"); + assert!(dropped.is_empty()); + } + + #[tokio::test] + async fn empty_exact_entry_is_ignored() { + // An empty exact entry could only match an empty entity, + // which recognizers don't emit. Filtering it costs + // nothing and keeps the lookup short. + let layer = SuppressionLayer::from_params(&params(&[""], &[], &[])).expect("layer builds"); + let source = "noreply@foo.com matters"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, 15)]).await; + assert_eq!(kept.len(), 1); + assert!(dropped.is_empty()); + } + + #[tokio::test] + async fn empty_regex_entry_is_ignored() { + // An empty regex matches at every position. Same + // catastrophe as empty substring; filter at construction. + let layer = SuppressionLayer::from_params(&params(&[], &[], &[""])).expect("layer builds"); + let source = "noreply@foo.com matters"; + let (kept, dropped) = apply_to(&layer, source, vec![entity(0, 15)]).await; + assert_eq!(kept.len(), 1, "empty regex must not drop"); + assert!(dropped.is_empty()); + } + + #[tokio::test] + async fn union_across_modes() { + // Three allow-list shapes, three entities. Each entity is + // suppressed by exactly one mode; all three drop.
+ let layer = SuppressionLayer::from_params(&params( + &["alpha@x.com"], + &["bravo"], + &[r"^charlie-\d+$"], + )) + .expect("layer builds"); + let source = "alpha@x.com bravo-team-12 charlie-99"; + let entities = vec![ + entity(0, 11), // exact + entity(12, 25), // substring + entity(26, 36), // regex + ]; + let (kept, dropped) = apply_to(&layer, source, entities).await; + assert!(kept.is_empty(), "all three should be suppressed"); + assert_eq!(dropped.len(), 3); + } +} diff --git a/crates/nvisy-toolkit/src/deduplication/suppress/params.rs b/crates/nvisy-toolkit/src/deduplication/suppress/params.rs new file mode 100644 index 00000000..d229572d --- /dev/null +++ b/crates/nvisy-toolkit/src/deduplication/suppress/params.rs @@ -0,0 +1,82 @@ +//! [`SuppressionParams`]: caller-supplied allow lists consumed by +//! [`SuppressionLayer`]. +//! +//! Three independent allow-list shapes apply by union — an entity +//! is dropped when **any** of them fires: +//! +//! - exact ASCII case-insensitive equality +//! - ASCII case-insensitive substring containment +//! - regex match (run on the text as-is, without lowercasing) +//! +//! All three operate on the entity's resolved text (sliced from +//! the source via [`TextAt::text_at`]), not the surrounding +//! document. +//! +//! [`SuppressionLayer`]: super::SuppressionLayer +//! [`TextAt::text_at`]: nvisy_core::extraction::TextAt::text_at + +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// Caller-supplied allow lists consumed by [`SuppressionLayer`]. +/// +/// All three lists default to empty; the layer short-circuits as +/// a fast no-op when every list is empty. +/// +/// [`SuppressionLayer`]: super::SuppressionLayer +#[derive(Debug, Clone, Default, PartialEq, Eq)] +#[derive(Serialize, Deserialize, JsonSchema)] +pub struct SuppressionParams { + /// Drop entities whose matched text equals one of these values + /// (ASCII case-insensitive). Use for known false-positive + /// values like `noreply@yourcompany.com`.
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub allow_values: Vec, + /// Drop entities whose matched text contains one of these + /// values as a substring (ASCII case-insensitive). Use when an + /// over-matching recognizer surrounds a known false-positive + /// value with extra text. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub allow_values_substring: Vec, + /// Drop entities whose matched text matches one of these + /// regular expressions. Compiled once at layer construction. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub allow_values_regex: Vec, +} + +impl SuppressionParams { + /// Empty params: every allow list defaults to empty. + #[must_use] + pub fn new() -> Self { + Self::default() + } + + /// Set the exact-match list. + #[must_use] + pub fn with_allow_values(mut self, values: Vec) -> Self { + self.allow_values = values; + self + } + + /// Set the substring-match list. + #[must_use] + pub fn with_allow_values_substring(mut self, values: Vec) -> Self { + self.allow_values_substring = values; + self + } + + /// Set the regex-match list. + #[must_use] + pub fn with_allow_values_regex(mut self, values: Vec) -> Self { + self.allow_values_regex = values; + self + } + + /// Return `true` when no allow-list values are configured. 
+ #[must_use] + pub fn is_empty(&self) -> bool { + self.allow_values.is_empty() + && self.allow_values_substring.is_empty() + && self.allow_values_regex.is_empty() + } +} diff --git a/crates/nvisy-toolkit/tests/fixtures/pipeline.rs b/crates/nvisy-toolkit/tests/fixtures/pipeline.rs index c3ba3f0e..9963227e 100644 --- a/crates/nvisy-toolkit/tests/fixtures/pipeline.rs +++ b/crates/nvisy-toolkit/tests/fixtures/pipeline.rs @@ -64,6 +64,7 @@ impl Fixture { let ctx = LayerContext::::new(&buffer); let entities = LayerPipeline::::from_params(&dedup_params()) + .expect("pipeline builds") .run(detected, &ctx) .await; @@ -103,6 +104,7 @@ impl Fixture { let ctx = LayerContext::::new(&buffer); let entities = LayerPipeline::::from_params(&dedup_params()) + .expect("pipeline builds") .run(detected, &ctx) .await; From 85940d9ffb2f3a58c27e3059e8d4d0a633140d4c Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 16 Jun 2026 01:37:39 +0200 Subject: [PATCH 09/14] feat(pattern,core): country scope + Presidio-aligned shipped pattern set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - nvisy-core: new `CountryCode` (ISO 3166-1 alpha-2, validated via `celes`). `RecognizerInput.country: Option` + `applies_to_country` mirror the existing language scoping. - nvisy-pattern: `Regex.countries` / `Dictionary.countries` (Vec, empty = world). `PatternRecognizer::recognize` honours per-call jurisdiction hints alongside language hints. - Asset tree reorganized into `world/`, `us/`, `uk/` subtrees; shipped accessors split into per-region modules (`shipped::patterns::{world,us,uk}`, dictionaries::world). Macro helpers exported as `__shipped_pattern` / `__shipped_dictionary` so sub-modules resolve their own include_str! paths. - Pattern count grows 23 → 34: world unchanged at 18; us 5 → 10 (+itin, npi, mbi, bank_account, medical_license); uk added at 6 (nhs, nino, driving_licence, postcode, vehicle_registration, passport). 
- Validators split into per-country sub-modules with dotted names (`us.ssn`, `us.aba_routing`, `us.npi`, `us.dea_number`, `uk.nhs`, `uk.nino`). Shared `luhn`, `iban`, `phone`, `date` stay flat. World pattern set extended (brand-aware credit_card, RFC5322-loose email, Cisco-form MAC, IPv4 CIDR, comprehensive IPv6 alternation set). - Pattern scores normalized to a single conservative-baseline scheme (most regex-only matches land at 0.1–0.5 before context boost). Confidence threshold in the toolkit test fixture lowered to 0.35 to match. - `assets/NOTICE.md` documents third-party regex provenance for the shipped pattern assets. Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 11 ++ Cargo.toml | 1 + crates/nvisy-context/README.md | 4 +- crates/nvisy-context/src/rule.rs | 9 +- crates/nvisy-core/Cargo.toml | 1 + crates/nvisy-core/src/primitive/country.rs | 179 ++++++++++++++++++ crates/nvisy-core/src/primitive/mod.rs | 6 +- crates/nvisy-core/src/recognition/input.rs | 40 +++- crates/nvisy-core/src/recognition/mod.rs | 2 +- .../nvisy-llm/src/recognition/file_prompt.rs | 8 +- .../nvisy-ner/src/recognition/aggregation.rs | 12 +- crates/nvisy-ner/src/recognition/config.rs | 6 +- crates/nvisy-pattern/README.md | 10 +- crates/nvisy-pattern/assets/NOTICE.md | 38 ++++ .../{ => world}/finance/cryptocurrencies.csv | 0 .../{ => world}/finance/cryptocurrencies.toml | 0 .../{ => world}/finance/currencies.csv | 0 .../{ => world}/finance/currencies.toml | 0 .../{general => world/personal}/languages.csv | 0 .../personal}/languages.toml | 0 .../personal}/nationalities.toml | 0 .../personal}/nationalities.txt | 0 .../personal}/religions.toml | 0 .../{general => world/personal}/religions.txt | 0 .../assets/patterns/contact/email.toml | 6 - .../patterns/finance/us_bank_routing.toml | 6 - .../assets/patterns/identity/ssn.toml | 8 - .../patterns/identity/us_drivers_license.toml | 6 - .../assets/patterns/network/ipv4.toml | 6 - .../assets/patterns/network/ipv6.toml | 6 - 
.../assets/patterns/network/mac_address.toml | 6 - .../assets/patterns/uk/contact/postcode.toml | 23 +++ .../patterns/uk/identity/driving_licence.toml | 24 +++ .../assets/patterns/uk/identity/nhs.toml | 17 ++ .../assets/patterns/uk/identity/nino.toml | 16 ++ .../assets/patterns/uk/identity/passport.toml | 26 +++ .../patterns/uk/vehicle/registration.toml | 39 ++++ .../patterns/us/finance/bank_account.toml | 26 +++ .../patterns/us/finance/bank_routing.toml | 32 ++++ .../assets/patterns/us/health/mbi.toml | 27 +++ .../patterns/us/health/medical_license.toml | 36 ++++ .../assets/patterns/us/health/npi.toml | 28 +++ .../patterns/us/identity/drivers_license.toml | 26 +++ .../assets/patterns/us/identity/itin.toml | 28 +++ .../identity/passport.toml} | 3 +- .../identity/postal_code.toml} | 1 + .../assets/patterns/us/identity/ssn.toml | 14 ++ .../assets/patterns/world/contact/email.toml | 12 ++ .../patterns/{ => world}/contact/phone.toml | 2 +- .../patterns/{ => world}/contact/url.toml | 2 +- .../{ => world}/credentials/aws_key.toml | 0 .../credentials/generic_api_key.toml | 2 +- .../{ => world}/credentials/github_token.toml | 0 .../{ => world}/credentials/private_key.toml | 0 .../{ => world}/credentials/stripe_key.toml | 0 .../{ => world}/finance/bitcoin_address.toml | 2 +- .../{ => world}/finance/credit_card.toml | 10 +- .../{ => world}/finance/ethereum_address.toml | 2 +- .../patterns/{ => world}/finance/iban.toml | 2 +- .../{ => world}/finance/swift_code.toml | 2 +- .../assets/patterns/world/network/ipv4.toml | 8 + .../assets/patterns/world/network/ipv6.toml | 25 +++ .../patterns/world/network/mac_address.toml | 16 ++ .../{ => world}/personal/date_of_birth.toml | 2 +- .../{ => world}/personal/datetime.toml | 2 +- .../nvisy-pattern/src/recognition/compiled.rs | 8 +- .../src/recognition/dictionary.rs | 18 +- .../src/recognition/recognizer.rs | 150 ++++++++++++++- crates/nvisy-pattern/src/recognition/regex.rs | 20 +- .../nvisy-pattern/src/shipped/dictionaries.rs | 102 
---------- .../src/shipped/dictionaries/mod.rs | 77 ++++++++ .../src/shipped/dictionaries/world.rs | 50 +++++ crates/nvisy-pattern/src/shipped/patterns.rs | 162 ---------------- .../nvisy-pattern/src/shipped/patterns/mod.rs | 108 +++++++++++ .../nvisy-pattern/src/shipped/patterns/uk.rs | 46 +++++ .../nvisy-pattern/src/shipped/patterns/us.rs | 64 +++++++ .../src/shipped/patterns/world.rs | 106 +++++++++++ crates/nvisy-pattern/src/validators/mod.rs | 34 +++- crates/nvisy-pattern/src/validators/uk/mod.rs | 12 ++ crates/nvisy-pattern/src/validators/uk/nhs.rs | 67 +++++++ .../nvisy-pattern/src/validators/uk/nino.rs | 57 ++++++ .../src/validators/us/aba_routing.rs | 64 +++++++ .../src/validators/us/dea_number.rs | 83 ++++++++ crates/nvisy-pattern/src/validators/us/mod.rs | 16 ++ crates/nvisy-pattern/src/validators/us/npi.rs | 66 +++++++ .../src/validators/{ => us}/ssn.rs | 8 +- .../testdata/inputs/identity.txt | 3 + crates/nvisy-pattern/testdata/inputs/uk.txt | 12 ++ .../nvisy-pattern/tests/shipped_detection.rs | 53 ++++++ .../tests/fixtures/registries.rs | 11 +- 90 files changed, 1848 insertions(+), 375 deletions(-) create mode 100644 crates/nvisy-core/src/primitive/country.rs create mode 100644 crates/nvisy-pattern/assets/NOTICE.md rename crates/nvisy-pattern/assets/dictionaries/{ => world}/finance/cryptocurrencies.csv (100%) rename crates/nvisy-pattern/assets/dictionaries/{ => world}/finance/cryptocurrencies.toml (100%) rename crates/nvisy-pattern/assets/dictionaries/{ => world}/finance/currencies.csv (100%) rename crates/nvisy-pattern/assets/dictionaries/{ => world}/finance/currencies.toml (100%) rename crates/nvisy-pattern/assets/dictionaries/{general => world/personal}/languages.csv (100%) rename crates/nvisy-pattern/assets/dictionaries/{general => world/personal}/languages.toml (100%) rename crates/nvisy-pattern/assets/dictionaries/{general => world/personal}/nationalities.toml (100%) rename crates/nvisy-pattern/assets/dictionaries/{general => 
world/personal}/nationalities.txt (100%) rename crates/nvisy-pattern/assets/dictionaries/{general => world/personal}/religions.toml (100%) rename crates/nvisy-pattern/assets/dictionaries/{general => world/personal}/religions.txt (100%) delete mode 100644 crates/nvisy-pattern/assets/patterns/contact/email.toml delete mode 100644 crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml delete mode 100644 crates/nvisy-pattern/assets/patterns/identity/ssn.toml delete mode 100644 crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml delete mode 100644 crates/nvisy-pattern/assets/patterns/network/ipv4.toml delete mode 100644 crates/nvisy-pattern/assets/patterns/network/ipv6.toml delete mode 100644 crates/nvisy-pattern/assets/patterns/network/mac_address.toml create mode 100644 crates/nvisy-pattern/assets/patterns/uk/contact/postcode.toml create mode 100644 crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml create mode 100644 crates/nvisy-pattern/assets/patterns/uk/identity/nhs.toml create mode 100644 crates/nvisy-pattern/assets/patterns/uk/identity/nino.toml create mode 100644 crates/nvisy-pattern/assets/patterns/uk/identity/passport.toml create mode 100644 crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/finance/bank_account.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/finance/bank_routing.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/health/mbi.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/health/npi.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/identity/drivers_license.toml create mode 100644 crates/nvisy-pattern/assets/patterns/us/identity/itin.toml rename crates/nvisy-pattern/assets/patterns/{identity/us_passport.toml => us/identity/passport.toml} (74%) rename 
crates/nvisy-pattern/assets/patterns/{identity/us_postal_code.toml => us/identity/postal_code.toml} (84%) create mode 100644 crates/nvisy-pattern/assets/patterns/us/identity/ssn.toml create mode 100644 crates/nvisy-pattern/assets/patterns/world/contact/email.toml rename crates/nvisy-pattern/assets/patterns/{ => world}/contact/phone.toml (97%) rename crates/nvisy-pattern/assets/patterns/{ => world}/contact/url.toml (87%) rename crates/nvisy-pattern/assets/patterns/{ => world}/credentials/aws_key.toml (100%) rename crates/nvisy-pattern/assets/patterns/{ => world}/credentials/generic_api_key.toml (94%) rename crates/nvisy-pattern/assets/patterns/{ => world}/credentials/github_token.toml (100%) rename crates/nvisy-pattern/assets/patterns/{ => world}/credentials/private_key.toml (100%) rename crates/nvisy-pattern/assets/patterns/{ => world}/credentials/stripe_key.toml (100%) rename crates/nvisy-pattern/assets/patterns/{ => world}/finance/bitcoin_address.toml (91%) rename crates/nvisy-pattern/assets/patterns/{ => world}/finance/credit_card.toml (55%) rename crates/nvisy-pattern/assets/patterns/{ => world}/finance/ethereum_address.toml (88%) rename crates/nvisy-pattern/assets/patterns/{ => world}/finance/iban.toml (93%) rename crates/nvisy-pattern/assets/patterns/{ => world}/finance/swift_code.toml (90%) create mode 100644 crates/nvisy-pattern/assets/patterns/world/network/ipv4.toml create mode 100644 crates/nvisy-pattern/assets/patterns/world/network/ipv6.toml create mode 100644 crates/nvisy-pattern/assets/patterns/world/network/mac_address.toml rename crates/nvisy-pattern/assets/patterns/{ => world}/personal/date_of_birth.toml (97%) rename crates/nvisy-pattern/assets/patterns/{ => world}/personal/datetime.toml (98%) delete mode 100644 crates/nvisy-pattern/src/shipped/dictionaries.rs create mode 100644 crates/nvisy-pattern/src/shipped/dictionaries/mod.rs create mode 100644 crates/nvisy-pattern/src/shipped/dictionaries/world.rs delete mode 100644 
crates/nvisy-pattern/src/shipped/patterns.rs create mode 100644 crates/nvisy-pattern/src/shipped/patterns/mod.rs create mode 100644 crates/nvisy-pattern/src/shipped/patterns/uk.rs create mode 100644 crates/nvisy-pattern/src/shipped/patterns/us.rs create mode 100644 crates/nvisy-pattern/src/shipped/patterns/world.rs create mode 100644 crates/nvisy-pattern/src/validators/uk/mod.rs create mode 100644 crates/nvisy-pattern/src/validators/uk/nhs.rs create mode 100644 crates/nvisy-pattern/src/validators/uk/nino.rs create mode 100644 crates/nvisy-pattern/src/validators/us/aba_routing.rs create mode 100644 crates/nvisy-pattern/src/validators/us/dea_number.rs create mode 100644 crates/nvisy-pattern/src/validators/us/mod.rs create mode 100644 crates/nvisy-pattern/src/validators/us/npi.rs rename crates/nvisy-pattern/src/validators/{ => us}/ssn.rs (89%) create mode 100644 crates/nvisy-pattern/testdata/inputs/uk.txt diff --git a/Cargo.lock b/Cargo.lock index d2403490..f987251d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -590,6 +590,16 @@ dependencies = [ "shlex", ] +[[package]] +name = "celes" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55028d5b1eebb35237512a3838ce5583211434a233c8bb179551a7197ffb7bd4" +dependencies = [ + "phf", + "serde", +] + [[package]] name = "cfb" version = "0.7.3" @@ -2906,6 +2916,7 @@ version = "0.1.0" dependencies = [ "async-trait", "bytes", + "celes", "derive_builder", "derive_more", "hipstr", diff --git a/Cargo.toml b/Cargo.toml index 0e9387ec..77a0822a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -71,6 +71,7 @@ hipstr = { version = "0.8", features = ["serde"] } jiff = { version = "0.2", features = ["serde"] } semver = { version = "1.0", features = ["serde"] } oxilangtag = { version = "0.1", features = ["serde"] } +celes = { version = "2.8", features = [] } humantime = { version = "2.1", features = [] } humantime-serde = { version = "1.1", features = [] } type-map = { version = "0.5", features = [] } 
diff --git a/crates/nvisy-context/README.md b/crates/nvisy-context/README.md index e8653b7a..cf59218a 100644 --- a/crates/nvisy-context/README.md +++ b/crates/nvisy-context/README.md @@ -6,8 +6,8 @@ Post-recognition keyword-boost enhancer for the Nvisy runtime. ## Overview -Mirrors Presidio's `ContextAwareEnhancer` pattern. Every recognizer -that wants score boosting declares a `Context` (a list of keywords +Context-aware confidence boosting. Every recognizer that wants +score boosting declares a `Context` (a list of keywords plus optional window / boost overrides), registered against the recognizer's name. After recognition, `ContextEnhancer` walks each detected `Entity`, looks the recognizer name up in the diff --git a/crates/nvisy-context/src/rule.rs b/crates/nvisy-context/src/rule.rs index f45c423a..da3c809d 100644 --- a/crates/nvisy-context/src/rule.rs +++ b/crates/nvisy-context/src/rule.rs @@ -23,19 +23,16 @@ use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; /// Default window radius in words *before* an entity match. -/// Mirrors Presidio's `context_prefix_count = 5`. pub const DEFAULT_PREFIX_WORDS: usize = 5; /// Default window radius in words *after* an entity match. Set /// equal to [`DEFAULT_PREFIX_WORDS`] so trailing context like /// "123-45-6789 (social security)" boosts the same as leading -/// context. Presidio defaults `context_suffix_count` to `0`; we -/// pick symmetric defaults because operators rarely realize the -/// asymmetry exists, and one-sided windows surprise people. +/// context. Asymmetric windows surprise operators who rarely +/// realize the asymmetry exists, so we pick symmetric defaults. pub const DEFAULT_SUFFIX_WORDS: usize = 5; -/// Default additive boost applied when a keyword fires. Matches -/// Presidio's `context_similarity_factor = 0.35`. +/// Default additive boost applied when a keyword fires. 
pub const DEFAULT_BOOST: f64 = 0.35; /// Per-label boost rule the [`Enhancer`] applies at runtime. diff --git a/crates/nvisy-core/Cargo.toml b/crates/nvisy-core/Cargo.toml index e815c4ca..3d9e4c03 100644 --- a/crates/nvisy-core/Cargo.toml +++ b/crates/nvisy-core/Cargo.toml @@ -45,6 +45,7 @@ uuid = { workspace = true, features = [] } bytes = { workspace = true, features = [] } hipstr = { workspace = true, features = [] } oxilangtag = { workspace = true, features = [] } +celes = { workspace = true, features = [] } type-map = { workspace = true, features = [] } # Async runtime and parallelism diff --git a/crates/nvisy-core/src/primitive/country.rs b/crates/nvisy-core/src/primitive/country.rs new file mode 100644 index 00000000..6fc9fca0 --- /dev/null +++ b/crates/nvisy-core/src/primitive/country.rs @@ -0,0 +1,179 @@ +//! [ISO 3166-1 alpha-2] country code type. +//! +//! Thin wrapper around [`celes::Country`] that exposes only the +//! alpha-2 surface — alpha-3, numeric, and long-name forms are +//! reachable via [`CountryCode::into_inner`] for the rare consumer +//! that needs them. +//! +//! [ISO 3166-1 alpha-2]: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 + +use std::fmt; +use std::str::FromStr; + +use serde::{Deserialize, Serialize}; + +use crate::Error; + +/// A validated [ISO 3166-1 alpha-2] country code. +/// +/// Two-letter uppercase code identifying a country or region. +/// Construction accepts any case (`"us"`, `"US"`, `"uS"`) and +/// rejects anything that isn't a known ISO 3166-1 alpha-2 code. 
+/// +/// # Examples +/// +/// ``` +/// use nvisy_core::primitive::CountryCode; +/// +/// let us = CountryCode::new("us").unwrap(); +/// assert_eq!(us.as_str(), "US"); +/// +/// assert!(CountryCode::new("USA").is_err()); +/// assert!(CountryCode::new("XZ").is_err()); +/// ``` +/// +/// [ISO 3166-1 alpha-2]: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize)] +#[serde(try_from = "String", into = "String")] +pub struct CountryCode(celes::Country); + +impl CountryCode { + /// Parse and validate a country code. + /// + /// Input is case-insensitive; the canonical form returned by + /// [`as_str`] is uppercase. + /// + /// # Errors + /// + /// Returns a validation error when `code` is not a known + /// ISO 3166-1 alpha-2 code. + /// + /// [`as_str`]: Self::as_str + pub fn new(code: &str) -> Result { + celes::Country::from_alpha2(code).map(Self).map_err(|_| { + Error::validation( + format!("country code `{code}` is not a known ISO 3166-1 alpha-2 code"), + "nvisy-core", + ) + }) + } + + /// Return the canonical (uppercase) alpha-2 representation. + #[must_use] + pub fn as_str(&self) -> &'static str { + self.0.alpha2 + } + + /// Return the underlying [`celes::Country`] for callers that + /// need alpha-3, numeric, or the country's long name.
+ #[must_use] + pub fn into_inner(self) -> celes::Country { + self.0 + } +} + +impl fmt::Display for CountryCode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl FromStr for CountryCode { + type Err = Error; + + fn from_str(s: &str) -> Result { + Self::new(s) + } +} + +impl TryFrom for CountryCode { + type Error = Error; + + fn try_from(value: String) -> Result { + Self::new(&value) + } +} + +impl From for String { + fn from(code: CountryCode) -> Self { + code.as_str().to_owned() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_uppercase_alpha2() { + let us = CountryCode::new("US").unwrap(); + assert_eq!(us.as_str(), "US"); + } + + #[test] + fn accepts_lowercase_alpha2_and_normalises() { + let us = CountryCode::new("us").unwrap(); + assert_eq!(us.as_str(), "US"); + } + + #[test] + fn accepts_mixed_case() { + let gb = CountryCode::new("Gb").unwrap(); + assert_eq!(gb.as_str(), "GB"); + } + + #[test] + fn rejects_alpha3_code() { + assert!(CountryCode::new("USA").is_err()); + assert!(CountryCode::new("GBR").is_err()); + } + + #[test] + fn rejects_unassigned_two_letter_code() { + assert!(CountryCode::new("XZ").is_err()); + assert!(CountryCode::new("ZZ").is_err()); + } + + #[test] + fn rejects_wrong_length() { + assert!(CountryCode::new("U").is_err()); + assert!(CountryCode::new("").is_err()); + assert!(CountryCode::new("USAB").is_err()); + } + + #[test] + fn rejects_non_alpha() { + assert!(CountryCode::new("U1").is_err()); + assert!(CountryCode::new("00").is_err()); + } + + #[test] + fn equality_is_canonical() { + // `us` and `US` should compare equal once parsed. 
+ let lower = CountryCode::new("us").unwrap(); + let upper = CountryCode::new("US").unwrap(); + assert_eq!(lower, upper); + } + + #[test] + fn serde_roundtrip_uppercase() { + let us = CountryCode::new("us").unwrap(); + let json = serde_json::to_string(&us).unwrap(); + assert_eq!(json, "\"US\""); + let back: CountryCode = serde_json::from_str(&json).unwrap(); + assert_eq!(back, us); + } + + #[test] + fn from_str_parses_alpha2() { + let de: CountryCode = "DE".parse().unwrap(); + assert_eq!(de.as_str(), "DE"); + } + + #[test] + fn display_writes_alpha2() { + let fr = CountryCode::new("FR").unwrap(); + assert_eq!(format!("{fr}"), "FR"); + } +} diff --git a/crates/nvisy-core/src/primitive/mod.rs b/crates/nvisy-core/src/primitive/mod.rs index e2fb3927..1744a0b7 100644 --- a/crates/nvisy-core/src/primitive/mod.rs +++ b/crates/nvisy-core/src/primitive/mod.rs @@ -8,16 +8,18 @@ //! ([`LanguageTag`], [`LanguageDetection`]). //! - `rendering` — output-side knobs ([`Color`], [`Dpi`]). //! -//! [`TimeSpan`] is the single root-level primitive — temporal -//! intervals don't have any companion types worth grouping yet. +//! Root-level primitives: [`CountryCode`] (ISO 3166-1 alpha-2), +//! [`TimeSpan`] (temporal intervals). 
mod confidence; +mod country; mod geometry; mod language; mod rendering; mod time_span; pub use self::confidence::{Confidence, ConfidenceThreshold}; +pub use self::country::CountryCode; pub use self::geometry::{ BoundingBox, Dimensions, IBoundingBox, NormalizedBoundingBox, Polygon, Vertex, }; diff --git a/crates/nvisy-core/src/recognition/input.rs b/crates/nvisy-core/src/recognition/input.rs index 0192878c..2bbc57ba 100644 --- a/crates/nvisy-core/src/recognition/input.rs +++ b/crates/nvisy-core/src/recognition/input.rs @@ -20,7 +20,7 @@ use uuid::Uuid; use super::Hint; use crate::extraction::Artifacts; use crate::modality::Modality; -use crate::primitive::LanguageTag; +use crate::primitive::{CountryCode, LanguageTag}; /// Per-call input for an [`EntityRecognizer`]. /// @@ -48,6 +48,12 @@ pub struct RecognizerInput { /// /// [`language`]: Self::language pub candidate_languages: Vec, + /// Caller-asserted jurisdiction. When `Some`, recognizers + /// that carry per-rule `countries` scopes skip rules that + /// don't match. `None` means "any" — rules that declare + /// countries still run as a permissive fallback so callers + /// who don't pass a hint don't lose detections. + pub country: Option, /// Uploader-supplied hint regions in modality-native coordinates. /// Recognizers that support hint adjudication (LLM-based NER, VLM) /// read this; recognizers that don't (pattern, dictionary) ignore @@ -73,6 +79,7 @@ impl RecognizerInput { artifacts: Artifacts::new(), language: None, candidate_languages: Vec::new(), + country: None, hints: Vec::new(), labels: Vec::new(), correlation_id: None, @@ -100,6 +107,13 @@ impl RecognizerInput { self } + /// Set the asserted jurisdiction. + #[must_use] + pub fn with_country(mut self, country: CountryCode) -> Self { + self.country = Some(country); + self + } + /// Attach uploader-supplied hint regions. 
#[must_use] pub fn with_hints(mut self, hints: Vec>) -> Self { @@ -144,4 +158,28 @@ impl RecognizerInput { None => true, } } + + /// Whether a recognizer rule scoped to `allowed` countries + /// should run for this call. + /// + /// - An empty `allowed` list means the rule is jurisdiction- + /// agnostic and always runs. + /// - When `allowed` is non-empty and [`country`] is `Some(_)`, + /// the rule runs only when the hint is in `allowed`. + /// - When [`country`] is `None`, the rule still runs — we + /// can't disprove applicability without a hint, and + /// silently dropping detections would surprise callers + /// who simply forgot to set the field. + /// + /// [`country`]: Self::country + #[must_use] + pub fn applies_to_country(&self, allowed: &[CountryCode]) -> bool { + if allowed.is_empty() { + return true; + } + match self.country.as_ref() { + Some(hint) => allowed.iter().any(|a| a == hint), + None => true, + } + } } diff --git a/crates/nvisy-core/src/recognition/mod.rs b/crates/nvisy-core/src/recognition/mod.rs index cd111f0a..25786aaf 100644 --- a/crates/nvisy-core/src/recognition/mod.rs +++ b/crates/nvisy-core/src/recognition/mod.rs @@ -1,4 +1,4 @@ -//! [`EntityRecognizer`]: the Presidio-style entity-detection trait. +//! [`EntityRecognizer`]: the entity-detection trait. //! //! Every detector that emits [`Entity`] for some modality `M` //! implements this trait — pattern recognizers, NER bento clients, diff --git a/crates/nvisy-llm/src/recognition/file_prompt.rs b/crates/nvisy-llm/src/recognition/file_prompt.rs index 510694da..ff153840 100644 --- a/crates/nvisy-llm/src/recognition/file_prompt.rs +++ b/crates/nvisy-llm/src/recognition/file_prompt.rs @@ -1,9 +1,9 @@ //! [`FilePrompt`]: load a [`Prompt`] from a TOML file. //! -//! Mirrors Presidio's prompt-as-data model: the user-prompt template -//! plus the label map plus the labels-to-ignore set all live in a -//! single TOML file. Users swap behaviour by editing the file, not -//! by writing Rust. 
Templates use Jinja2 syntax via `minijinja`. +//! Prompt-as-data shape: the user-prompt template plus the label +//! map plus the labels-to-ignore set all live in a single TOML +//! file. Users swap behaviour by editing the file, not by writing +//! Rust. Templates use Jinja2 syntax via `minijinja`. //! //! # TOML schema //! diff --git a/crates/nvisy-ner/src/recognition/aggregation.rs b/crates/nvisy-ner/src/recognition/aggregation.rs index e79fb791..7e5f5993 100644 --- a/crates/nvisy-ner/src/recognition/aggregation.rs +++ b/crates/nvisy-ner/src/recognition/aggregation.rs @@ -1,13 +1,11 @@ //! [`AggregationStrategy`] and [`AlignmentMode`]: policies for //! collapsing per-token NER predictions into entity spans. //! -//! Mirror the equivalent knobs on Presidio's -//! `NerModelConfiguration`. The producer engine may apply them -//! server-side (the Bento `inference-gliner` already returns -//! aggregated spans), in which case the consumer-side knobs are -//! advisory; or the producer may emit unaggregated -//! token-classification output, in which case [`NerRecognizer`] -//! applies them itself. +//! The producer engine may apply them server-side (the Bento +//! `inference-gliner` already returns aggregated spans), in which +//! case the consumer-side knobs are advisory; or the producer may +//! emit unaggregated token-classification output, in which case +//! [`NerRecognizer`] applies them itself. //! //! [`NerRecognizer`]: super::NerRecognizer diff --git a/crates/nvisy-ner/src/recognition/config.rs b/crates/nvisy-ner/src/recognition/config.rs index c8af8c87..e8216d00 100644 --- a/crates/nvisy-ner/src/recognition/config.rs +++ b/crates/nvisy-ner/src/recognition/config.rs @@ -1,8 +1,8 @@ //! [`NerModel`]: client-side NER tuning knobs. //! -//! Mirrors Presidio's `NerModelConfiguration`. Applied inside -//! [`NerRecognizer`] before entities are emitted, so backends stay -//! dumb and label normalization is uniform across them. +//! 
Applied inside [`NerRecognizer`] before entities are emitted, +//! so backends stay dumb and label normalization is uniform +//! across them. //! //! Construct via [`NerModel::default`] for the canonical defaults //! (canonical label map, no ignored labels, score = 0.85, no diff --git a/crates/nvisy-pattern/README.md b/crates/nvisy-pattern/README.md index f39a99e2..fa8acd75 100644 --- a/crates/nvisy-pattern/README.md +++ b/crates/nvisy-pattern/README.md @@ -8,11 +8,11 @@ Nvisy runtime. ## Overview `PatternRecognizer` compiles a set of `Regex` rules (each holding -one or more regex `Variant`s, a Presidio-shaped multi-strategy -group) and `Dictionary` term lists into pooled scanners — one -shared `regex::RegexSet` for the regex side and one shared -`aho_corasick::AhoCorasick` automaton for the literal side. A -single walk over the input runs both scanners and emits +one or more regex `Variant`s grouped as a multi-strategy detector +for one entity type) and `Dictionary` term lists into pooled +scanners — one shared `regex::RegexSet` for the regex side and +one shared `aho_corasick::AhoCorasick` automaton for the literal +side. A single walk over the input runs both scanners and emits `Entity` values in modality-local byte coordinates. Rules may declare per-label context keywords. Calling diff --git a/crates/nvisy-pattern/assets/NOTICE.md b/crates/nvisy-pattern/assets/NOTICE.md new file mode 100644 index 00000000..161792e5 --- /dev/null +++ b/crates/nvisy-pattern/assets/NOTICE.md @@ -0,0 +1,38 @@ +# Third-party attribution: shipped pattern assets + +Several shipped pattern TOMLs under this directory carry regular +expressions adapted from [Microsoft Presidio][presidio] +(`microsoft/presidio`, MIT licensed), specifically the +`presidio-analyzer/presidio_analyzer/predefined_recognizers/` +classes referenced inline in each TOML's leading comment. + +The Presidio MIT license text is reproduced below, per its +`Permission notice` clause. 
+ +[presidio]: https://github.com/microsoft/presidio + +--- + +``` +MIT License + +Copyright (c) Microsoft Corporation. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+``` diff --git a/crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.csv b/crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.csv similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.csv rename to crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.csv diff --git a/crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.toml b/crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.toml similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.toml rename to crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.toml diff --git a/crates/nvisy-pattern/assets/dictionaries/finance/currencies.csv b/crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.csv similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/finance/currencies.csv rename to crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.csv diff --git a/crates/nvisy-pattern/assets/dictionaries/finance/currencies.toml b/crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.toml similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/finance/currencies.toml rename to crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.toml diff --git a/crates/nvisy-pattern/assets/dictionaries/general/languages.csv b/crates/nvisy-pattern/assets/dictionaries/world/personal/languages.csv similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/general/languages.csv rename to crates/nvisy-pattern/assets/dictionaries/world/personal/languages.csv diff --git a/crates/nvisy-pattern/assets/dictionaries/general/languages.toml b/crates/nvisy-pattern/assets/dictionaries/world/personal/languages.toml similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/general/languages.toml rename to 
crates/nvisy-pattern/assets/dictionaries/world/personal/languages.toml diff --git a/crates/nvisy-pattern/assets/dictionaries/general/nationalities.toml b/crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.toml similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/general/nationalities.toml rename to crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.toml diff --git a/crates/nvisy-pattern/assets/dictionaries/general/nationalities.txt b/crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.txt similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/general/nationalities.txt rename to crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.txt diff --git a/crates/nvisy-pattern/assets/dictionaries/general/religions.toml b/crates/nvisy-pattern/assets/dictionaries/world/personal/religions.toml similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/general/religions.toml rename to crates/nvisy-pattern/assets/dictionaries/world/personal/religions.toml diff --git a/crates/nvisy-pattern/assets/dictionaries/general/religions.txt b/crates/nvisy-pattern/assets/dictionaries/world/personal/religions.txt similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/general/religions.txt rename to crates/nvisy-pattern/assets/dictionaries/world/personal/religions.txt diff --git a/crates/nvisy-pattern/assets/patterns/contact/email.toml b/crates/nvisy-pattern/assets/patterns/contact/email.toml deleted file mode 100644 index fb37ff45..00000000 --- a/crates/nvisy-pattern/assets/patterns/contact/email.toml +++ /dev/null @@ -1,6 +0,0 @@ -name = "email" -label = "email_address" - -[[variants]] -regex = "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b" -score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml b/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml deleted file mode 100644 index 
12010716..00000000 --- a/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml +++ /dev/null @@ -1,6 +0,0 @@ -name = "us-bank-routing" -label = "bank_routing" - -[[variants]] -regex = "\\b(?:0[1-9]|[12]\\d|3[0-2])\\d{7}\\b" -score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/identity/ssn.toml b/crates/nvisy-pattern/assets/patterns/identity/ssn.toml deleted file mode 100644 index f2076b26..00000000 --- a/crates/nvisy-pattern/assets/patterns/identity/ssn.toml +++ /dev/null @@ -1,8 +0,0 @@ -name = "ssn" -label = "government_id" -context = ["social security", "ssn", "tax id", "taxpayer identification"] - -[[variants]] -regex = "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b" -score = 0.9 -validator = "ssn" diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml b/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml deleted file mode 100644 index 873af318..00000000 --- a/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml +++ /dev/null @@ -1,6 +0,0 @@ -name = "us-drivers-license" -label = "drivers_license" - -[[variants]] -regex = "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b" -score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/network/ipv4.toml b/crates/nvisy-pattern/assets/patterns/network/ipv4.toml deleted file mode 100644 index d64403dd..00000000 --- a/crates/nvisy-pattern/assets/patterns/network/ipv4.toml +++ /dev/null @@ -1,6 +0,0 @@ -name = "ipv4" -label = "ip_address" - -[[variants]] -regex = "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b" -score = 0.75 diff --git a/crates/nvisy-pattern/assets/patterns/network/ipv6.toml b/crates/nvisy-pattern/assets/patterns/network/ipv6.toml deleted file mode 100644 index dfc12ecd..00000000 --- a/crates/nvisy-pattern/assets/patterns/network/ipv6.toml +++ /dev/null @@ -1,6 +0,0 @@ -name = "ipv6" -label = "ip_address" - -[[variants]] -regex = 
"\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b" -score = 0.75 diff --git a/crates/nvisy-pattern/assets/patterns/network/mac_address.toml b/crates/nvisy-pattern/assets/patterns/network/mac_address.toml deleted file mode 100644 index fcca5944..00000000 --- a/crates/nvisy-pattern/assets/patterns/network/mac_address.toml +++ /dev/null @@ -1,6 +0,0 @@ -name = "mac-address" -label = "mac_address" - -[[variants]] -regex = "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b" -score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/uk/contact/postcode.toml b/crates/nvisy-pattern/assets/patterns/uk/contact/postcode.toml new file mode 100644 index 00000000..8c0db110 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/uk/contact/postcode.toml @@ -0,0 +1,23 @@ +# UK Postcode. +# +# Reference: https://en.wikipedia.org/wiki/Postcodes_in_the_United_Kingdom + +name = "uk-postcode" +label = "postal_code" +countries = ["GB"] +languages = ["en"] +context = [ + "postcode", + "post code", + "postal code", + "zip", + "address", + "delivery", + "mailing", + "shipping", + "correspondence", +] + +[[variants]] +regex = '\b(GIR\s?0AA|[A-PR-UWYZ][0-9][ABCDEFGHJKPSTUW]?\s?[0-9][ABD-HJLNP-UW-Z]{2}|[A-PR-UWYZ][0-9]{2}\s?[0-9][ABD-HJLNP-UW-Z]{2}|[A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRVWXY]?\s?[0-9][ABD-HJLNP-UW-Z]{2}|[A-PR-UWYZ][A-HK-Y][0-9]{2}\s?[0-9][ABD-HJLNP-UW-Z]{2})\b' +score = 0.1 diff --git a/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml b/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml new file mode 100644 index 00000000..47ec45b8 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml @@ -0,0 +1,24 @@ +# UK Driving Licence Number (DVLA). 
+# +# Format: 16-char alphanumeric — 5-char surname (letters padded +# with 9s) + decade digit + month (01-12 or 51-62 for female) + +# day + year digit + 2 initial chars (or 9) + check + 2 letters. + +name = "uk-driving-licence" +label = "drivers_license" +countries = ["GB"] +languages = ["en"] +context = [ + "driving licence", + "driving license", + "driver's licence", + "driver's license", + "dvla", + "dl number", + "licence number", + "license number", +] + +[[variants]] +regex = '\b[A-Z9]{5}[0-9](?:0[1-9]|1[0-2]|5[1-9]|6[0-2])(?:0[1-9]|[12][0-9]|3[01])[0-9][A-Z9]{2}[A-Z0-9][A-Z]{2}\b' +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/uk/identity/nhs.toml b/crates/nvisy-pattern/assets/patterns/uk/identity/nhs.toml new file mode 100644 index 00000000..8c39ee2e --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/uk/identity/nhs.toml @@ -0,0 +1,17 @@ +# UK National Health Service number. + +name = "uk-nhs" +label = "medical_id" +countries = ["GB"] +languages = ["en"] +context = [ + "nhs", + "national health service", + "health services authority", + "health authority", +] + +[[variants]] +regex = '\b([0-9]{3})[- ]?([0-9]{3})[- ]?([0-9]{4})\b' +score = 0.5 +validator = "uk.nhs" diff --git a/crates/nvisy-pattern/assets/patterns/uk/identity/nino.toml b/crates/nvisy-pattern/assets/patterns/uk/identity/nino.toml new file mode 100644 index 00000000..8de7d04e --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/uk/identity/nino.toml @@ -0,0 +1,16 @@ +# UK National Insurance Number. +# +# Rust's `regex` crate doesn't support look-around, so the +# reserved-prefix exclusions live in the `uk.nino` validator +# rather than the regex. 
+ +name = "uk-nino" +label = "national_insurance_number" +countries = ["GB"] +languages = ["en"] +context = ["national insurance", "ni number", "nino"] + +[[variants]] +regex = '\b([a-ceghj-pr-tw-zA-CEGHJ-PR-TW-Z][a-ceghj-npr-tw-zA-CEGHJ-NPR-TW-Z]) ?([0-9]{2}) ?([0-9]{2}) ?([0-9]{2}) ?([a-dA-D])\b' +score = 0.5 +validator = "uk.nino" diff --git a/crates/nvisy-pattern/assets/patterns/uk/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/uk/identity/passport.toml new file mode 100644 index 00000000..03b8b9a4 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/uk/identity/passport.toml @@ -0,0 +1,26 @@ +# UK Passport Number (post-2015 format). +# +# 2 letters + 7 digits. Same shape as several other country +# passports — without context-keyword boost the FP rate is high. +# Low score by default so callers rely on context to lift +# legitimate matches. + +name = "uk-passport" +label = "passport_number" +countries = ["GB"] +languages = ["en"] +context = [ + "passport", + "passport number", + "travel document", + "uk passport", + "british passport", + "her majesty", + "his majesty", + "hm passport", + "hmpo", +] + +[[variants]] +regex = '\b[A-Z]{2}\d{7}\b' +score = 0.1 diff --git a/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml b/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml new file mode 100644 index 00000000..0d585be7 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml @@ -0,0 +1,39 @@ +# UK Vehicle Registration Number. +# +# Covers three eras still seen on the road: +# - Current (2001+): 2 area letters + 2-digit age + 3 random +# letters, e.g. 
"AB51 ABC" +# - Prefix (1983-2001): year letter + 1-3 digits + 3 letters +# - Suffix (1963-1983): 3 letters + 1-3 digits + year letter + +name = "uk-vehicle-registration" +label = "license_plate" +countries = ["GB"] +languages = ["en"] +context = [ + "vehicle", + "registration", + "number plate", + "licence plate", + "license plate", + "reg", + "vrn", + "dvla", + "v5c", + "logbook", + "mot", + "car", + "insured vehicle", +] + +[[variants]] +regex = '\b[A-HJ-PR-Y][A-HJ-PR-Y](?:0[1-9]|[1-7][0-9])[- ]?[A-HJ-PR-Z]{3}\b' +score = 0.3 + +[[variants]] +regex = '\b[A-HJ-NPR-TV-Y]\d{1,3}[- ]?[A-HJ-PR-Y][A-HJ-PR-Z]{2}\b' +score = 0.2 + +[[variants]] +regex = '\b[A-HJ-PR-Z]{3}[- ]?\d{1,3}[- ]?[A-HJ-NPR-TV-Y]\b' +score = 0.15 diff --git a/crates/nvisy-pattern/assets/patterns/us/finance/bank_account.toml b/crates/nvisy-pattern/assets/patterns/us/finance/bank_account.toml new file mode 100644 index 00000000..03dd30ef --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/finance/bank_account.toml @@ -0,0 +1,26 @@ +# US Bank Account Number. +# +# 8-17 digit run with no checksum — usable only with strong +# surrounding context. The very-weak score relies on the +# context-keyword boost to lift legitimate matches above +# downstream confidence thresholds. + +name = "us-bank-account" +label = "bank_account" +countries = ["US"] +languages = ["en"] +context = [ + "check", + "checking account", + "account", + "account#", + "acct", + "bank", + "save", + "savings", + "debit", +] + +[[variants]] +regex = '\b\d{8,17}\b' +score = 0.05 diff --git a/crates/nvisy-pattern/assets/patterns/us/finance/bank_routing.toml b/crates/nvisy-pattern/assets/patterns/us/finance/bank_routing.toml new file mode 100644 index 00000000..687512ae --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/finance/bank_routing.toml @@ -0,0 +1,32 @@ +# US ABA Routing Number. 
+# +# Two variants: +# - 9-digit unbroken form; the regex admits prefixes 01-32 (a superset of the +# Fed ranges 01-12, 21-32) and is checksum-validated by `us.aba_routing`. +# - Dashed form `\b[0123678]\d{3}-\d{4}-\d\b` with the same +# checksum. + +name = "us-bank-routing" +label = "bank_routing" +countries = ["US"] +languages = ["en"] +context = [ + "routing", + "aba", + "rtn", + "transit", + "bank", + "deposit", + "wire", + "ach", +] + +[[variants]] +regex = '\b(?:0[1-9]|[12]\d|3[0-2])\d{7}\b' +score = 0.5 +validator = "us.aba_routing" + +[[variants]] +regex = '\b[0123678]\d{3}-\d{4}-\d\b' +score = 0.4 +validator = "us.aba_routing" diff --git a/crates/nvisy-pattern/assets/patterns/us/health/mbi.toml b/crates/nvisy-pattern/assets/patterns/us/health/mbi.toml new file mode 100644 index 00000000..1359e09d --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/health/mbi.toml @@ -0,0 +1,27 @@ +# US Medicare Beneficiary Identifier (MBI). +# +# 11-character CMS identifier with position-specific +# character-class rules and letters restricted to +# `ACDEFGHJKMNPQRTUVWXY` (excludes S, L, O, I, B, Z). + +name = "us-mbi" +label = "medical_id" +countries = ["US"] +languages = ["en"] +context = [ + "mbi", + "medicare beneficiary", + "medicare", + "beneficiary", + "cms", +] + +# 11-character form (no dashes), positions: N A AN N A AN N A A N N. +[[variants]] +regex = '\b[0-9][ACDEFGHJKMNPQRTUVWXY][0-9ACDEFGHJKMNPQRTUVWXY][0-9][ACDEFGHJKMNPQRTUVWXY][0-9ACDEFGHJKMNPQRTUVWXY][0-9][ACDEFGHJKMNPQRTUVWXY][ACDEFGHJKMNPQRTUVWXY][0-9][0-9]\b' +score = 0.3 + +# Dashed display form `XXXX-XXX-XXXX`.
+[[variants]] +regex = '\b[0-9][ACDEFGHJKMNPQRTUVWXY][0-9ACDEFGHJKMNPQRTUVWXY][0-9]-[ACDEFGHJKMNPQRTUVWXY][0-9ACDEFGHJKMNPQRTUVWXY][0-9]-[ACDEFGHJKMNPQRTUVWXY][ACDEFGHJKMNPQRTUVWXY][0-9][0-9]\b' +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml b/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml new file mode 100644 index 00000000..80e80390 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml @@ -0,0 +1,36 @@ +# US DEA (Drug Enforcement Administration) registration number, +# used as a medical license identifier. +# +# 9-character format: two letters (DEA registration type plus +# surname initial) followed by 7 digits, the last of which is a +# checksum. Without context the pattern has a high false-positive +# rate (matches arbitrary letter-pair plus digit-run strings), so +# the regex score is low and the boost layer is expected to lift +# legitimate matches. + +name = "us-medical-license" +label = "medical_id" +countries = ["US"] +languages = ["en"] +context = [ + "medical", + "license", + "licence", + "certificate", + "dea", + "controlled substance", + "prescription", + "prescriber", +] + +# DEA registration type letters: A, B, C, D, E, F, G, H, J, K, L, +# M, P, R, S, T, U, X (plus mid-2000s practitioner-9 series). +[[variants]] +regex = '[abcdefghjklmprstuxABCDEFGHJKLMPRSTUX][a-zA-Z]\d{7}' +score = 0.4 +validator = "us.dea_number" + +[[variants]] +regex = '[abcdefghjklmprstuxABCDEFGHJKLMPRSTUX]9\d{7}' +score = 0.4 +validator = "us.dea_number" diff --git a/crates/nvisy-pattern/assets/patterns/us/health/npi.toml b/crates/nvisy-pattern/assets/patterns/us/health/npi.toml new file mode 100644 index 00000000..0c657209 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/health/npi.toml @@ -0,0 +1,28 @@ +# US National Provider Identifier (NPI). +# +# 10-digit number assigned by CMS; validated via Luhn checksum on +# `80840` + the 10 digits. 
+ +name = "us-npi" +label = "medical_id" +countries = ["US"] +languages = ["en"] +context = [ + "npi", + "national provider", + "provider", + "npi number", + "provider id", + "provider identifier", + "taxonomy", +] + +[[variants]] +regex = '\b[12]\d{9}\b' +score = 0.1 +validator = "us.npi" + +[[variants]] +regex = '\b[12]\d{3}[ -]\d{3}[ -]\d{3}\b' +score = 0.4 +validator = "us.npi" diff --git a/crates/nvisy-pattern/assets/patterns/us/identity/drivers_license.toml b/crates/nvisy-pattern/assets/patterns/us/identity/drivers_license.toml new file mode 100644 index 00000000..a72501ee --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/identity/drivers_license.toml @@ -0,0 +1,26 @@ +# US Driver's License (state-shape union). +# +# Omits a bare-digit weak variant (`\b[0-9]{6,14}\b`) since it +# matches phone numbers, dates, etc. and is unusable without +# strong context. + +name = "us-drivers-license" +label = "drivers_license" +countries = ["US"] +languages = ["en"] +context = [ + "driver", + "drivers", + "driver's", + "license", + "licence", + "dl", + "dl#", + "dlnum", +] + +# Alphanumeric shapes — most US state license formats start with +# letters, often with a hyphenated structure (e.g. MI: D123-4567-8901). +[[variants]] +regex = '\b(?:[A-Z]\d{3}-\d{4}-\d{4}|[A-Z]\d{3,6}|[A-Z]\d{5,9}|[A-Z]\d{6,8}|[A-Z]\d{4,8}|[A-Z]\d{9,11}|[A-Z]{1,2}\d{5,6}|H\d{8}|V\d{6}|X\d{8}|[A-Z]{2}\d{2,5}|[A-Z]{2}\d{3,7}|\d{2}[A-Z]{3}\d{5,6}|[A-Z]\d{13,14}|[A-Z]\d{18}|[A-Z]\d{6}R|[A-Z]\d{9}|[A-Z]\d{1,12}|\d{9}[A-Z]|[A-Z]{2}\d{6}[A-Z]|\d{8}[A-Z]{2}|\d{3}[A-Z]{2}\d{4}|[A-Z]\d[A-Z]\d[A-Z]|\d{7,8}[A-Z])\b' +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/us/identity/itin.toml b/crates/nvisy-pattern/assets/patterns/us/identity/itin.toml new file mode 100644 index 00000000..726861f5 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/identity/itin.toml @@ -0,0 +1,28 @@ +# US Individual Taxpayer Identification Number (ITIN). 
+# +# Format: 9NN-MM-NNNN where the middle group restricts to the +# IRS-published ranges. No checksum. + +name = "us-itin" +label = "tax_id" +countries = ["US"] +languages = ["en"] +context = [ + "individual", + "taxpayer", + "itin", + "tax", + "payer", + "taxid", + "tin", +] + +# Medium: dashes / spaces around the IRS-published middle range. +[[variants]] +regex = '\b9\d{2}[- ](?:5\d|6[0-5]|7\d|8[0-8]|9(?:[0-2]|[4-9]))[- ]\d{4}\b' +score = 0.5 + +# Weak: unbroken 9-digit form. +[[variants]] +regex = '\b9\d{2}(?:5\d|6[0-5]|7\d|8[0-8]|9(?:[0-2]|[4-9]))\d{4}\b' +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml b/crates/nvisy-pattern/assets/patterns/us/identity/passport.toml similarity index 74% rename from crates/nvisy-pattern/assets/patterns/identity/us_passport.toml rename to crates/nvisy-pattern/assets/patterns/us/identity/passport.toml index d7087d83..439529a6 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml +++ b/crates/nvisy-pattern/assets/patterns/us/identity/passport.toml @@ -1,6 +1,7 @@ name = "us-passport" label = "passport_number" +countries = ["US"] [[variants]] regex = "\\b[A-Z]\\d{8}\\b" -score = 0.5 +score = 0.1 diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml b/crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml similarity index 84% rename from crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml rename to crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml index 737b391f..53ee38c9 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml +++ b/crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml @@ -1,5 +1,6 @@ name = "us-postal-code" label = "postal_code" +countries = ["US"] [[variants]] regex = "\\b\\d{5}(?:-\\d{4})?\\b" diff --git a/crates/nvisy-pattern/assets/patterns/us/identity/ssn.toml b/crates/nvisy-pattern/assets/patterns/us/identity/ssn.toml new file mode 100644 index 
00000000..18b11034 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/us/identity/ssn.toml @@ -0,0 +1,14 @@ +# US Social Security Number. +# +# Omits very-weak variants (bare 9 digits, off-position splits). +# Context-keyword boost is expected to lift legitimate matches. + +name = "ssn" +label = "government_id" +countries = ["US"] +context = ["social security", "ssn", "tax id", "taxpayer identification"] + +[[variants]] +regex = '\b(\d{3})[- .](\d{2})[- .](\d{4})\b' +score = 0.5 +validator = "us.ssn" diff --git a/crates/nvisy-pattern/assets/patterns/world/contact/email.toml b/crates/nvisy-pattern/assets/patterns/world/contact/email.toml new file mode 100644 index 00000000..68114a4c --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/world/contact/email.toml @@ -0,0 +1,12 @@ +# Email address (RFC 5322-loose). +# +# Accepts the RFC 5322 atom characters in the local part +# (`!#$%&'*+\-/=?^_`{|}~`) so plus-addressing, dotted locals, and +# similar are caught. + +name = "email" +label = "email_address" + +[[variants]] +regex = '''\b(?:[!#$%&'*+\-/=?^_`{|}~\w]|[!#$%&'*+\-/=?^_`{|}~\w][!#$%&'*+\-/=?^_`{|}~.\w]{0,}[!#$%&'*+\-/=?^_`{|}~\w])@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*\b''' +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/contact/phone.toml b/crates/nvisy-pattern/assets/patterns/world/contact/phone.toml similarity index 97% rename from crates/nvisy-pattern/assets/patterns/contact/phone.toml rename to crates/nvisy-pattern/assets/patterns/world/contact/phone.toml index ce5fe9dd..83e0bc96 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/phone.toml +++ b/crates/nvisy-pattern/assets/patterns/world/contact/phone.toml @@ -9,5 +9,5 @@ fr = ["téléphone", "telephone", "appel", "mobile", "portable", "tel", "fax", " [[variants]] regex = "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b" -score = 0.8 +score = 0.4 validator = "phone" diff --git a/crates/nvisy-pattern/assets/patterns/contact/url.toml 
b/crates/nvisy-pattern/assets/patterns/world/contact/url.toml similarity index 87% rename from crates/nvisy-pattern/assets/patterns/contact/url.toml rename to crates/nvisy-pattern/assets/patterns/world/contact/url.toml index ec11fcee..7096570a 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/url.toml +++ b/crates/nvisy-pattern/assets/patterns/world/contact/url.toml @@ -3,4 +3,4 @@ label = "url" [[variants]] regex = "\\bhttps?://[^\\s/$.?#][^\\s]*\\b" -score = 0.9 +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/aws_key.toml similarity index 100% rename from crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml rename to crates/nvisy-pattern/assets/patterns/world/credentials/aws_key.toml diff --git a/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml similarity index 94% rename from crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml rename to crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml index be69abc5..bcd59d63 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml +++ b/crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml @@ -3,4 +3,4 @@ label = "api_key" [[variants]] regex = "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?" 
-score = 0.7 +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/github_token.toml similarity index 100% rename from crates/nvisy-pattern/assets/patterns/credentials/github_token.toml rename to crates/nvisy-pattern/assets/patterns/world/credentials/github_token.toml diff --git a/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/private_key.toml similarity index 100% rename from crates/nvisy-pattern/assets/patterns/credentials/private_key.toml rename to crates/nvisy-pattern/assets/patterns/world/credentials/private_key.toml diff --git a/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/stripe_key.toml similarity index 100% rename from crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml rename to crates/nvisy-pattern/assets/patterns/world/credentials/stripe_key.toml diff --git a/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml b/crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml similarity index 91% rename from crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml rename to crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml index a68c435a..31f6fad6 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml @@ -3,4 +3,4 @@ label = "crypto_address" [[variants]] regex = "\\b(?:bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\\b" -score = 0.85 +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml b/crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml similarity index 55% rename from crates/nvisy-pattern/assets/patterns/finance/credit_card.toml rename to 
crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml index cfe4ab2b..d3412b4f 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml @@ -1,3 +1,9 @@ +# Credit / debit card number. +# +# Brand-prefix variants matching Visa, Mastercard, Discover, etc. +# Luhn-validated. The Luhn validator drops false positives that +# a length-based fallback regex would otherwise pick up. + name = "credit-card" label = "payment_card" @@ -8,6 +14,6 @@ de = ["karte", "kredit", "kreditkarte", "debit", "zahlung", "visa", "mastercard" fr = ["carte", "crédit", "credit", "débit", "debit", "paiement", "visa", "mastercard", "amex"] [[variants]] -regex = "\\b(?:\\d[ \\-]*?){13,19}\\b" -score = 0.85 +regex = '\b(?:(?:4\d{3})|(?:5[0-5]\d{2})|(?:6\d{3})|(?:1\d{3})|(?:3\d{3}))[- ]?(?:\d{3,4})[- ]?(?:\d{3,4})[- ]?(?:\d{3,5})\b' +score = 0.5 validator = "luhn" diff --git a/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml b/crates/nvisy-pattern/assets/patterns/world/finance/ethereum_address.toml similarity index 88% rename from crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml rename to crates/nvisy-pattern/assets/patterns/world/finance/ethereum_address.toml index 2860d8a4..939ed818 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/ethereum_address.toml @@ -3,4 +3,4 @@ label = "crypto_address" [[variants]] regex = "\\b0x[0-9a-fA-F]{40}\\b" -score = 0.85 +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/finance/iban.toml b/crates/nvisy-pattern/assets/patterns/world/finance/iban.toml similarity index 93% rename from crates/nvisy-pattern/assets/patterns/finance/iban.toml rename to crates/nvisy-pattern/assets/patterns/world/finance/iban.toml index 7256b240..3680ee9e 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/iban.toml +++ 
b/crates/nvisy-pattern/assets/patterns/world/finance/iban.toml @@ -4,5 +4,5 @@ context = ["iban", "bank", "account", "transfer", "swift"] [[variants]] regex = "\\b[A-Z]{2}\\d{2}\\s?[A-Z0-9]{4}\\s?(?:\\d{4}\\s?){2,7}\\d{1,4}\\b" -score = 0.85 +score = 0.5 validator = "iban" diff --git a/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml b/crates/nvisy-pattern/assets/patterns/world/finance/swift_code.toml similarity index 90% rename from crates/nvisy-pattern/assets/patterns/finance/swift_code.toml rename to crates/nvisy-pattern/assets/patterns/world/finance/swift_code.toml index 7147b65c..e1f1a60c 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/swift_code.toml @@ -3,4 +3,4 @@ label = "swift_code" [[variants]] regex = "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b" -score = 0.7 +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/world/network/ipv4.toml b/crates/nvisy-pattern/assets/patterns/world/network/ipv4.toml new file mode 100644 index 00000000..926d8890 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/world/network/ipv4.toml @@ -0,0 +1,8 @@ +# IPv4 address with optional CIDR suffix. + +name = "ipv4" +label = "ip_address" + +[[variants]] +regex = '\b(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?:25[0-5]|2[0-4]\d|[01]?\d\d?)(?:/(?:[0-2]?\d|3[0-2]))?\b' +score = 0.6 diff --git a/crates/nvisy-pattern/assets/patterns/world/network/ipv6.toml b/crates/nvisy-pattern/assets/patterns/world/network/ipv6.toml new file mode 100644 index 00000000..5ff02317 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/world/network/ipv6.toml @@ -0,0 +1,25 @@ +# IPv6 address (comprehensive form set). +# +# Rust's `regex` crate doesn't support look-around, so the +# variants use `\b` boundaries to limit match overlap inside +# larger IPv6-shaped strings. 
Over-matching at sub-token +# boundaries is rare in practice and gets collapsed by the entity +# dedup pipeline downstream. + +name = "ipv6" +label = "ip_address" + +# IPv4-mapped IPv6 (::ffff:1.2.3.4 et al); no leading `\b` because `::` starts on a non-word char. +[[variants]] +regex = '::(?:ffff(?::0{1,4})?:)?(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:/(?:12[0-8]|1[01]\d|[1-9]?\d))?\b' +score = 0.6 + +# IPv4-embedded IPv6 (X:X:…:1.2.3.4). +[[variants]] +regex = '\b(?:(?:[0-9A-Fa-f]{1,4}:){1,5}:(?:[0-9A-Fa-f]{1,4}:){0,4}|(?:[0-9A-Fa-f]{1,4}:){6})(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?:/(?:12[0-8]|1[01]\d|[1-9]?\d))?\b' +score = 0.6 + +# Standard IPv6, all compressed forms; longest-tail alternatives first (alternation is leftmost-first), `\b` only before hex-leading forms. +[[variants]] +regex = '(?:\b(?:(?:[0-9A-Fa-f]{1,4}:){7}[0-9A-Fa-f]{1,4}|[0-9A-Fa-f]{1,4}:(?::[0-9A-Fa-f]{1,4}){1,6}|(?:[0-9A-Fa-f]{1,4}:){1,2}(?::[0-9A-Fa-f]{1,4}){1,5}|(?:[0-9A-Fa-f]{1,4}:){1,3}(?::[0-9A-Fa-f]{1,4}){1,4}|(?:[0-9A-Fa-f]{1,4}:){1,4}(?::[0-9A-Fa-f]{1,4}){1,3}|(?:[0-9A-Fa-f]{1,4}:){1,5}(?::[0-9A-Fa-f]{1,4}){1,2}|(?:[0-9A-Fa-f]{1,4}:){1,6}:[0-9A-Fa-f]{1,4}|(?:[0-9A-Fa-f]{1,4}:){1,7}:)|:(?::[0-9A-Fa-f]{1,4}){1,7})(?:%[0-9a-zA-Z]+)?(?:/(?:12[0-8]|1[01]\d|[1-9]?\d))?\b' +score = 0.6 diff --git a/crates/nvisy-pattern/assets/patterns/world/network/mac_address.toml b/crates/nvisy-pattern/assets/patterns/world/network/mac_address.toml new file mode 100644 index 00000000..c7425050 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/world/network/mac_address.toml @@ -0,0 +1,16 @@ +# MAC (Ethernet) address. +# +# Two variants: +# - Colon/hyphen form: `00:1A:2B:3C:4D:5E` or `00-1A-2B-3C-4D-5E`. +# - Cisco dot-form: `0000.0000.0000` (three 4-hex-digit groups). 
+ +name = "mac-address" +label = "mac_address" + +[[variants]] +regex = "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b" +score = 0.5 + +[[variants]] +regex = "\\b[0-9A-Fa-f]{4}\\.[0-9A-Fa-f]{4}\\.[0-9A-Fa-f]{4}\\b" +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml b/crates/nvisy-pattern/assets/patterns/world/personal/date_of_birth.toml similarity index 97% rename from crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml rename to crates/nvisy-pattern/assets/patterns/world/personal/date_of_birth.toml index a178c41d..0045927b 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml +++ b/crates/nvisy-pattern/assets/patterns/world/personal/date_of_birth.toml @@ -9,5 +9,5 @@ fr = ["naissance", "né", "nee", "née", "date de naissance", "anniversaire"] [[variants]] regex = "\\b(?:(?:0[1-9]|1[0-2]|[1-9])[/\\-](?:0[1-9]|[12]\\d|3[01]|[1-9])[/\\-](?:19|20)\\d{2}|(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01]))\\b" -score = 0.6 +score = 0.1 validator = "date" diff --git a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml b/crates/nvisy-pattern/assets/patterns/world/personal/datetime.toml similarity index 98% rename from crates/nvisy-pattern/assets/patterns/personal/datetime.toml rename to crates/nvisy-pattern/assets/patterns/world/personal/datetime.toml index e37edf92..76d0e1df 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml +++ b/crates/nvisy-pattern/assets/patterns/world/personal/datetime.toml @@ -9,4 +9,4 @@ fr = ["horodatage", "créé", "cree", "créée", "creee", "modifié", "modifie", [[variants]] regex = "\\b(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[T ](?:[01]\\d|2[0-3]):[0-5]\\d(?::[0-5]\\d)?(?:Z|[+\\-]\\d{2}:?\\d{2})?\\b" -score = 0.7 +score = 0.1 diff --git a/crates/nvisy-pattern/src/recognition/compiled.rs b/crates/nvisy-pattern/src/recognition/compiled.rs index d1a61556..c9fe8d6e 100644 --- 
a/crates/nvisy-pattern/src/recognition/compiled.rs +++ b/crates/nvisy-pattern/src/recognition/compiled.rs @@ -19,7 +19,7 @@ use std::sync::Arc; use nvisy_core::entity::{Entity, EntityLabelRef, PatternProvenance, TrailProvenance, TrailStep}; use nvisy_core::modality::{Text, TextLocation}; -use nvisy_core::primitive::{Confidence, LanguageTag}; +use nvisy_core::primitive::{Confidence, CountryCode, LanguageTag}; use regex::Regex; use crate::validators::Validator; @@ -44,6 +44,9 @@ pub(super) struct CompiledPattern { /// Languages the parent pattern applies to. /// Empty means "any language". pub languages: Vec, + /// Countries the parent pattern applies to. + /// Empty means "any country". + pub countries: Vec, } impl CompiledPattern { @@ -93,6 +96,9 @@ pub(super) struct CompiledDictionary { /// Languages this dictionary applies to. Empty means "any /// language". pub languages: Vec, + /// Countries this dictionary applies to. Empty means "any + /// country". + pub countries: Vec, /// Reject matches whose immediate neighbours are word /// characters (alphanumeric or `_`). Mirrors regex `\b`. pub word_boundary: bool, diff --git a/crates/nvisy-pattern/src/recognition/dictionary.rs b/crates/nvisy-pattern/src/recognition/dictionary.rs index 3be046c6..e5acdce5 100644 --- a/crates/nvisy-pattern/src/recognition/dictionary.rs +++ b/crates/nvisy-pattern/src/recognition/dictionary.rs @@ -3,7 +3,7 @@ use derive_builder::Builder; use nvisy_core::Error; use nvisy_core::entity::EntityLabelRef; -use nvisy_core::primitive::{Confidence, LanguageTag}; +use nvisy_core::primitive::{Confidence, CountryCode, LanguageTag}; use serde::Deserialize; use super::context::Context; @@ -142,6 +142,12 @@ pub struct Dictionary { #[builder(default)] #[serde(default)] pub languages: Vec, + /// ISO 3166-1 alpha-2 country codes the dictionary applies + /// to. Empty means "any country" — the dictionary fires + /// regardless of the per-call jurisdiction hint. 
+ #[builder(default)] + #[serde(default)] + pub countries: Vec, /// Require word-boundary surroundings on every match. /// /// With the default of `true`, the term `"am"` matches the @@ -213,6 +219,12 @@ impl Dictionary { if let Some(wb) = metadata.word_boundary { builder = builder.with_word_boundary(wb); } + if let Some(languages) = metadata.languages { + builder = builder.with_languages(languages); + } + if let Some(countries) = metadata.countries { + builder = builder.with_countries(countries); + } Ok(builder) } } @@ -227,4 +239,8 @@ struct DictionaryMetadata { context: Option, #[serde(default)] word_boundary: Option, + #[serde(default)] + languages: Option>, + #[serde(default)] + countries: Option>, } diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index 3423e80b..9f6bbb3c 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -269,6 +269,7 @@ impl PatternRecognizerBuilder { score: variant.score, validator, languages: pattern.languages.clone(), + countries: pattern.countries.clone(), }); } } @@ -335,6 +336,7 @@ impl PatternRecognizerBuilder { term_end, term_scores, languages: dict.languages.clone(), + countries: dict.countries.clone(), word_boundary: dict.word_boundary, }); } @@ -429,6 +431,9 @@ impl EntityRecognizer for PatternRecognizer { if !input.applies_to_language(&pat.languages) { continue; } + if !input.applies_to_country(&pat.countries) { + continue; + } for m in pat.regex.find_iter(text) { if let Some(validator) = pat.validator.as_ref() && !validator.validate(m.as_str()) @@ -449,6 +454,9 @@ impl EntityRecognizer for PatternRecognizer { if !input.applies_to_language(&dict.languages) { continue; } + if !input.applies_to_country(&dict.countries) { + continue; + } if dict.word_boundary && !has_word_boundaries(text, mat.start(), mat.end()) { continue; } @@ -467,7 +475,7 @@ mod tests { use nvisy_core::entity::{Entity, 
EntityLabelRef, builtins}; use nvisy_core::modality::{Text, TextData}; - use nvisy_core::primitive::Confidence; + use nvisy_core::primitive::{Confidence, CountryCode}; use nvisy_core::recognition::RecognizerInput; use super::*; @@ -555,6 +563,77 @@ mod tests { assert_eq!(map.len(), 2); } + #[test] + fn regex_omits_countries_by_default() { + let toml = r#" + name = "x" + label = "government_id" + [[variants]] + regex = "\\d+" + "#; + let regex = crate::Regex::from_toml(toml).expect("TOML parses"); + assert!(regex.countries.is_empty(), "default countries must be empty"); + } + + #[test] + fn regex_parses_countries_field() { + let toml = r#" + name = "ssn" + label = "government_id" + countries = ["US"] + [[variants]] + regex = "\\d+" + "#; + let regex = crate::Regex::from_toml(toml).expect("TOML parses"); + assert_eq!(regex.countries.len(), 1); + assert_eq!(regex.countries[0].as_str(), "US"); + } + + #[test] + fn regex_parses_multiple_countries() { + let toml = r#" + name = "eu-vat" + label = "tax_id" + countries = ["de", "FR", "iT"] + [[variants]] + regex = "\\d+" + "#; + let regex = crate::Regex::from_toml(toml).expect("TOML parses"); + assert_eq!(regex.countries.len(), 3); + // Construction normalises to uppercase. 
+ let codes: Vec<&str> = regex.countries.iter().map(CountryCode::as_str).collect(); + assert_eq!(codes, vec!["DE", "FR", "IT"]); + } + + #[test] + fn regex_rejects_invalid_country() { + let toml = r#" + name = "x" + label = "government_id" + countries = ["XZ"] + [[variants]] + regex = "\\d+" + "#; + assert!( + crate::Regex::from_toml(toml).is_err(), + "unassigned country code must error", + ); + } + + #[test] + fn regex_builder_accepts_countries() { + let variant = crate::Variant::new(r"\d{3}-\d{2}-\d{4}").unwrap(); + let regex = crate::Regex::builder() + .with_name("ssn") + .with_label(builtins::GOVERNMENT_ID.label_ref()) + .with_variants(vec![variant]) + .with_countries(vec![CountryCode::new("US").unwrap()]) + .build() + .expect("regex builds"); + assert_eq!(regex.countries.len(), 1); + assert_eq!(regex.countries[0].as_str(), "US"); + } + async fn run_with_language( recognizer: &impl EntityRecognizer, text: &str, @@ -571,6 +650,75 @@ mod tests { .entities } + async fn run_with_country( + recognizer: &impl EntityRecognizer, + text: &str, + country: Option<&str>, + ) -> Vec> { + let mut input = RecognizerInput::new(TextData::new(text.to_owned())); + if let Some(c) = country { + input = input.with_country(CountryCode::new(c).expect("country code parses")); + } + recognizer + .recognize(&input) + .await + .expect("recognize succeeds") + .entities + } + + fn us_ssn_regex() -> crate::Regex { + let variant = crate::Variant::new(r"\b\d{3}-\d{2}-\d{4}\b") + .expect("variant builds") + .with_score(Confidence::clamped(0.5)); + crate::Regex::builder() + .with_name("ssn") + .with_label(builtins::GOVERNMENT_ID.label_ref()) + .with_variants(vec![variant]) + .with_countries(vec![CountryCode::new("US").unwrap()]) + .build() + .expect("regex builds") + } + + #[tokio::test] + async fn country_scoped_rule_fires_under_matching_hint() { + let recognizer = PatternRecognizer::builder() + .with_pattern(us_ssn_regex()) + .build() + .expect("recognizer builds"); + let entities = 
run_with_country(&recognizer, "SSN: 123-45-6789", Some("US")).await; + assert_eq!(entities.len(), 1, "US-scoped rule must fire under US hint"); + } + + #[tokio::test] + async fn country_scoped_rule_skipped_under_non_matching_hint() { + let recognizer = PatternRecognizer::builder() + .with_pattern(us_ssn_regex()) + .build() + .expect("recognizer builds"); + let entities = run_with_country(&recognizer, "SSN: 123-45-6789", Some("GB")).await; + assert!( + entities.is_empty(), + "US-scoped rule must not fire under GB hint", + ); + } + + #[tokio::test] + async fn country_scoped_rule_fires_without_hint() { + // Permissive fallback: missing hint shouldn't drop the + // detection. Matches the existing `applies_to_language` + // semantic. + let recognizer = PatternRecognizer::builder() + .with_pattern(us_ssn_regex()) + .build() + .expect("recognizer builds"); + let entities = run_with_country(&recognizer, "SSN: 123-45-6789", None).await; + assert_eq!( + entities.len(), + 1, + "missing country hint must permit US-scoped rule to run", + ); + } + fn per_language_credit_card_regex() -> crate::Regex { let variant = crate::Variant::new(r"\b\d{16}\b") .expect("variant builds") diff --git a/crates/nvisy-pattern/src/recognition/regex.rs b/crates/nvisy-pattern/src/recognition/regex.rs index 6c926ff4..0690ce34 100644 --- a/crates/nvisy-pattern/src/recognition/regex.rs +++ b/crates/nvisy-pattern/src/recognition/regex.rs @@ -3,7 +3,7 @@ use derive_builder::Builder; use nvisy_core::Error; use nvisy_core::entity::EntityLabelRef; -use nvisy_core::primitive::{Confidence, LanguageTag}; +use nvisy_core::primitive::{Confidence, CountryCode, LanguageTag}; use serde::Deserialize; use super::context::Context; @@ -92,11 +92,11 @@ fn default_score() -> Confidence { /// Regex detection rule: one label, optional keyword boosts, and /// one or more [`Variant`]s. 
/// -/// Mirrors the Presidio "pattern recognizer" shape — several regex -/// strategies for one entity type, plus a shared context-keyword -/// list. Every variant emits the same [`label`]; context keywords -/// are harvested by [`PatternRecognizer`] into a wrapping boost -/// layer and are never read by the rule itself. +/// A rule groups several regex strategies under a single entity +/// type plus a shared context-keyword list. Every variant emits +/// the same [`label`]; context keywords are harvested by +/// [`PatternRecognizer`] into a wrapping boost layer and are +/// never read by the rule itself. /// /// # Examples /// @@ -149,6 +149,14 @@ pub struct Regex { #[builder(default)] #[serde(default)] pub languages: Vec, + /// ISO 3166-1 alpha-2 country codes the rule applies to. + /// Empty means "any country" — the rule fires regardless of + /// the per-call jurisdiction hint. Use this to scope a + /// pattern to specific national formats (e.g. `["US"]` for + /// the SSN regex). + #[builder(default)] + #[serde(default)] + pub countries: Vec, } impl Regex { diff --git a/crates/nvisy-pattern/src/shipped/dictionaries.rs b/crates/nvisy-pattern/src/shipped/dictionaries.rs deleted file mode 100644 index dac8baa8..00000000 --- a/crates/nvisy-pattern/src/shipped/dictionaries.rs +++ /dev/null @@ -1,102 +0,0 @@ -//! Built-in [`Dictionary`]s, embedded at compile time. -//! -//! Each accessor pairs a TOML metadata sidecar -//! (`assets/dictionaries/**/*.toml`) with a term source (`*.csv` -//! for multi-column term lists, `*.txt` for one-per-line), merging -//! them via [`Dictionary::metadata_from_toml`] + [`Term::from_csv`] -//! / [`Term::from_text`]. -//! -//! [`Dictionary`]: crate::Dictionary - -use crate::recognition::{Dictionary, Term}; - -macro_rules! 
shipped_dictionary { - ($(#[$meta:meta])* fn $name:ident from $meta_path:literal with csv $terms:literal) => { - $(#[$meta])* - #[must_use] - pub fn $name() -> Dictionary { - let terms = Term::from_csv(include_str!(concat!( - "../../assets/dictionaries/", - $terms - ))) - .expect(concat!("shipped term source `", $terms, "` parses")); - Dictionary::metadata_from_toml(include_str!(concat!( - "../../assets/dictionaries/", - $meta_path - ))) - .expect(concat!("shipped metadata `", $meta_path, "` is well-formed")) - .with_terms(terms) - .build() - .expect(concat!("shipped dictionary `", $meta_path, "` builds")) - } - }; - ($(#[$meta:meta])* fn $name:ident from $meta_path:literal with text $terms:literal) => { - $(#[$meta])* - #[must_use] - pub fn $name() -> Dictionary { - let terms = Term::from_text(include_str!(concat!( - "../../assets/dictionaries/", - $terms - ))); - Dictionary::metadata_from_toml(include_str!(concat!( - "../../assets/dictionaries/", - $meta_path - ))) - .expect(concat!("shipped metadata `", $meta_path, "` is well-formed")) - .with_terms(terms) - .build() - .expect(concat!("shipped dictionary `", $meta_path, "` builds")) - } - }; -} - -shipped_dictionary!( - /// Cryptocurrency names and ticker symbols (BTC, Bitcoin, ETH, - /// Ethereum, …). - fn cryptocurrencies from "finance/cryptocurrencies.toml" with csv "finance/cryptocurrencies.csv" -); -shipped_dictionary!( - /// Fiat currency names and ISO 4217 codes (USD, US Dollar, EUR, - /// Euro, …). - fn currencies from "finance/currencies.toml" with csv "finance/currencies.csv" -); -shipped_dictionary!( - /// Human-language names and ISO 639 codes (English, en, - /// French, fr, …). - fn languages from "general/languages.toml" with csv "general/languages.csv" -); -shipped_dictionary!( - /// Demonyms and nationality terms (American, French, …). 
- fn nationalities from "general/nationalities.toml" with text "general/nationalities.txt" -); -shipped_dictionary!( - /// Religious affiliations (Christianity, Islam, …). - fn religions from "general/religions.toml" with text "general/religions.txt" -); - -/// Every built-in dictionary shipped by this crate, in arbitrary -/// stable order. -#[must_use] -pub fn all() -> Vec { - vec![ - cryptocurrencies(), - currencies(), - languages(), - nationalities(), - religions(), - ] -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn every_shipped_dictionary_parses() { - let dicts = all(); - assert_eq!(dicts.len(), 5); - for dict in &dicts { - assert!(!dict.terms.is_empty(), "{} has no terms", dict.name); - } - } -} diff --git a/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs b/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs new file mode 100644 index 00000000..2190100e --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs @@ -0,0 +1,77 @@ +//! Built-in [`Dictionary`]s, embedded at compile time. +//! +//! Accessors are grouped by region — `world::*` for universal +//! dictionaries; future country-specific dictionaries land in +//! `<country>::*` sub-modules. Each pairs a TOML metadata sidecar +//! (`assets/dictionaries/<region>/<category>/*.toml`) with a term +//! source (`*.csv` for multi-column lists, `*.txt` for one-per-line), +//! merging them via [`Dictionary::metadata_from_toml`] + +//! [`crate::Term::from_csv`] / [`crate::Term::from_text`]. +//! +//! [`Dictionary`]: crate::Dictionary + +pub mod world; + +use crate::Dictionary; + +/// Helper used by every per-region sub-module to define a shipped +/// dictionary accessor. +/// +/// Paths are resolved with `include_str!` against the path of the +/// file that *expands* the macro, so callers in sub-modules pass +/// paths relative to themselves. +#[doc(hidden)] +#[macro_export] +macro_rules! 
__shipped_dictionary { + ($(#[$meta:meta])* fn $name:ident from $meta_path:literal with csv $terms:literal) => { + $(#[$meta])* + #[must_use] + pub fn $name() -> $crate::Dictionary { + let terms = $crate::Term::from_csv(include_str!($terms)) + .expect(concat!("shipped term source `", $terms, "` parses")); + $crate::Dictionary::metadata_from_toml(include_str!($meta_path)) + .expect(concat!("shipped metadata `", $meta_path, "` is well-formed")) + .with_terms(terms) + .build() + .expect(concat!("shipped dictionary `", $meta_path, "` builds")) + } + }; + ($(#[$meta:meta])* fn $name:ident from $meta_path:literal with text $terms:literal) => { + $(#[$meta])* + #[must_use] + pub fn $name() -> $crate::Dictionary { + let terms = $crate::Term::from_text(include_str!($terms)); + $crate::Dictionary::metadata_from_toml(include_str!($meta_path)) + .expect(concat!("shipped metadata `", $meta_path, "` is well-formed")) + .with_terms(terms) + .build() + .expect(concat!("shipped dictionary `", $meta_path, "` builds")) + } + }; +} + +/// Every built-in dictionary shipped by this crate, regardless of +/// region. +#[must_use] +pub fn all() -> Vec<Dictionary> { + world::all() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn every_shipped_dictionary_parses() { + let dicts = all(); + assert_eq!(dicts.len(), 5); + for dict in &dicts { + assert!(!dict.terms.is_empty(), "{} has no terms", dict.name); + } + } + + #[test] + fn world_set_has_5_dictionaries() { + assert_eq!(world::all().len(), 5); + } +} diff --git a/crates/nvisy-pattern/src/shipped/dictionaries/world.rs b/crates/nvisy-pattern/src/shipped/dictionaries/world.rs new file mode 100644 index 00000000..9941d11b --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/dictionaries/world.rs @@ -0,0 +1,50 @@ +//! Universal dictionaries — apply regardless of jurisdiction. 
+ +use crate::Dictionary; +use crate::__shipped_dictionary as shipped_dictionary; + +shipped_dictionary!( + /// Cryptocurrency names and ticker symbols (BTC, Bitcoin, ETH, + /// Ethereum, …). + fn cryptocurrencies + from "../../../assets/dictionaries/world/finance/cryptocurrencies.toml" + with csv "../../../assets/dictionaries/world/finance/cryptocurrencies.csv" +); +shipped_dictionary!( + /// Fiat currency names and ISO 4217 codes (USD, US Dollar, + /// EUR, Euro, …). + fn currencies + from "../../../assets/dictionaries/world/finance/currencies.toml" + with csv "../../../assets/dictionaries/world/finance/currencies.csv" +); +shipped_dictionary!( + /// Human-language names and ISO 639 codes (English, en, + /// French, fr, …). + fn languages + from "../../../assets/dictionaries/world/personal/languages.toml" + with csv "../../../assets/dictionaries/world/personal/languages.csv" +); +shipped_dictionary!( + /// Demonyms and nationality terms (American, French, …). + fn nationalities + from "../../../assets/dictionaries/world/personal/nationalities.toml" + with text "../../../assets/dictionaries/world/personal/nationalities.txt" +); +shipped_dictionary!( + /// Religious affiliations (Christianity, Islam, …). + fn religions + from "../../../assets/dictionaries/world/personal/religions.toml" + with text "../../../assets/dictionaries/world/personal/religions.txt" +); + +/// Every world-scoped built-in dictionary. +#[must_use] +pub fn all() -> Vec<Dictionary> { + vec![ + cryptocurrencies(), + currencies(), + languages(), + nationalities(), + religions(), + ] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns.rs b/crates/nvisy-pattern/src/shipped/patterns.rs deleted file mode 100644 index f9fc4140..00000000 --- a/crates/nvisy-pattern/src/shipped/patterns.rs +++ /dev/null @@ -1,162 +0,0 @@ -//! Built-in [`Regex`] rules, embedded at compile time. -//! -//! Each accessor returns a fresh [`Regex`] parsed from a TOML -//! definition file under `assets/patterns/`. 
The parse happens on -//! every call — rules are cheap to construct since -//! [`PatternRecognizer::build`] does the heavy compilation. -//! -//! [`Regex`]: crate::Regex -//! [`PatternRecognizer::build`]: crate::PatternRecognizer - -use crate::Regex; - -macro_rules! shipped_pattern { - ($(#[$meta:meta])* fn $name:ident from $path:literal) => { - $(#[$meta])* - #[must_use] - pub fn $name() -> Regex { - Regex::from_toml(include_str!(concat!("../../assets/patterns/", $path))) - .expect(concat!("shipped pattern `", $path, "` is well-formed")) - } - }; -} - -shipped_pattern!( - /// Email address (RFC-loose). - fn email from "contact/email.toml" -); -shipped_pattern!( - /// International phone numbers. - fn phone from "contact/phone.toml" -); -shipped_pattern!( - /// URLs (HTTP/HTTPS/FTP). - fn url from "contact/url.toml" -); - -shipped_pattern!( - /// AWS access key IDs. - fn aws_key from "credentials/aws_key.toml" -); -shipped_pattern!( - /// Heuristic generic API key. - fn generic_api_key from "credentials/generic_api_key.toml" -); -shipped_pattern!( - /// GitHub personal access tokens. - fn github_token from "credentials/github_token.toml" -); -shipped_pattern!( - /// PEM-formatted private keys. - fn private_key from "credentials/private_key.toml" -); -shipped_pattern!( - /// Stripe live/test secret keys. - fn stripe_key from "credentials/stripe_key.toml" -); - -shipped_pattern!( - /// Bitcoin (legacy + bech32) addresses. - fn bitcoin_address from "finance/bitcoin_address.toml" -); -shipped_pattern!( - /// Credit-card numbers, Luhn-validated. - fn credit_card from "finance/credit_card.toml" -); -shipped_pattern!( - /// Ethereum addresses. - fn ethereum_address from "finance/ethereum_address.toml" -); -shipped_pattern!( - /// International Bank Account Numbers. - fn iban from "finance/iban.toml" -); -shipped_pattern!( - /// SWIFT / BIC codes. - fn swift_code from "finance/swift_code.toml" -); -shipped_pattern!( - /// US bank routing numbers (ABA RTN). 
- fn us_bank_routing from "finance/us_bank_routing.toml" -); - -shipped_pattern!( - /// US Social Security numbers (AAA-GG-SSSS). - fn ssn from "identity/ssn.toml" -); -shipped_pattern!( - /// US driver's license numbers. - fn us_drivers_license from "identity/us_drivers_license.toml" -); -shipped_pattern!( - /// US passport numbers. - fn us_passport from "identity/us_passport.toml" -); -shipped_pattern!( - /// US ZIP and ZIP+4 postal codes. - fn us_postal_code from "identity/us_postal_code.toml" -); - -shipped_pattern!( - /// IPv4 addresses. - fn ipv4 from "network/ipv4.toml" -); -shipped_pattern!( - /// IPv6 addresses. - fn ipv6 from "network/ipv6.toml" -); -shipped_pattern!( - /// MAC (Ethernet) addresses. - fn mac_address from "network/mac_address.toml" -); - -shipped_pattern!( - /// Date of birth in common written formats. - fn date_of_birth from "personal/date_of_birth.toml" -); -shipped_pattern!( - /// Date + time stamps in ISO-like formats. - fn datetime from "personal/datetime.toml" -); - -/// Every built-in pattern shipped by this crate, in arbitrary -/// stable order. -#[must_use] -pub fn all() -> Vec<Regex> { - vec![ - email(), - phone(), - url(), - aws_key(), - generic_api_key(), - github_token(), - private_key(), - stripe_key(), - bitcoin_address(), - credit_card(), - ethereum_address(), - iban(), - swift_code(), - us_bank_routing(), - ssn(), - us_drivers_license(), - us_passport(), - us_postal_code(), - ipv4(), - ipv6(), - mac_address(), - date_of_birth(), - datetime(), - ] -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn every_shipped_pattern_parses() { - let patterns = all(); - assert_eq!(patterns.len(), 23); - } -} diff --git a/crates/nvisy-pattern/src/shipped/patterns/mod.rs b/crates/nvisy-pattern/src/shipped/patterns/mod.rs new file mode 100644 index 00000000..4ee09223 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/mod.rs @@ -0,0 +1,108 @@ +//! Built-in [`Regex`] rules, embedded at compile time. +//! +//! 
Accessors are grouped by region — `world::*` for universal +//! patterns, `<country>::*` (e.g. `us::*`, `uk::*`) for +//! country-specific ones. Each returns a fresh [`Regex`] parsed +//! from a TOML definition under +//! `assets/patterns/<region>/<category>/`. The parse happens on +//! every call — rules are cheap to construct since +//! [`PatternRecognizer::build`] does the heavy compilation. +//! +//! [`Regex`]: crate::Regex +//! [`PatternRecognizer::build`]: crate::PatternRecognizer + +pub mod uk; +pub mod us; +pub mod world; + +use crate::Regex; + +/// Helper used by every per-region sub-module to define a shipped +/// pattern accessor. +/// +/// The `$path` is resolved with `include_str!` against the path +/// of the file that *expands* the macro, so callers in sub-modules +/// (e.g. `world.rs`) pass paths relative to themselves. +#[doc(hidden)] +#[macro_export] +macro_rules! __shipped_pattern { + ($(#[$meta:meta])* fn $name:ident from $path:literal) => { + $(#[$meta])* + #[must_use] + pub fn $name() -> $crate::Regex { + $crate::Regex::from_toml(include_str!($path)) + .expect(concat!("shipped pattern `", $path, "` is well-formed")) + } + }; +} + +/// Every built-in pattern shipped by this crate, regardless of +/// region. 
+#[must_use] +pub fn all() -> Vec<Regex> { + let mut out = world::all(); + out.extend(us::all()); + out.extend(uk::all()); + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn every_shipped_pattern_parses() { + let patterns = all(); + assert_eq!(patterns.len(), 34); + } + + #[test] + fn world_set_has_18_patterns() { + assert_eq!(world::all().len(), 18); + } + + #[test] + fn us_set_has_10_patterns() { + assert_eq!(us::all().len(), 10); + } + + #[test] + fn uk_set_has_6_patterns() { + assert_eq!(uk::all().len(), 6); + } + + #[test] + fn world_patterns_have_no_country_scope() { + for pattern in world::all() { + assert!( + pattern.countries.is_empty(), + "world-scoped pattern `{}` must not declare countries", + pattern.name, + ); + } + } + + #[test] + fn us_patterns_are_country_scoped_to_us() { + for pattern in us::all() { + assert_eq!( + pattern.countries.iter().map(|c| c.as_str()).collect::<Vec<_>>(), + vec!["US"], + "US-scoped pattern `{}` must declare countries = [US]", + pattern.name, + ); + } + } + + #[test] + fn uk_patterns_are_country_scoped_to_gb() { + for pattern in uk::all() { + assert_eq!( + pattern.countries.iter().map(|c| c.as_str()).collect::<Vec<_>>(), + vec!["GB"], + "UK-scoped pattern `{}` must declare countries = [GB]", + pattern.name, + ); + } + } +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/uk.rs b/crates/nvisy-pattern/src/shipped/patterns/uk.rs new file mode 100644 index 00000000..d10dcd01 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/uk.rs @@ -0,0 +1,46 @@ +//! United Kingdom — patterns scoped to UK jurisdictional formats. +//! +//! See `assets/NOTICE.md` for third-party attribution. + +use crate::Regex; +use crate::__shipped_pattern as shipped_pattern; + +shipped_pattern!( + /// UK NHS numbers (10-digit, mod-11 validated). + fn nhs from "../../../assets/patterns/uk/identity/nhs.toml" +); +shipped_pattern!( + /// UK National Insurance numbers (NINO). 
+ fn nino from "../../../assets/patterns/uk/identity/nino.toml" +); +shipped_pattern!( + /// UK driving licence numbers (DVLA 16-character). + fn driving_licence from "../../../assets/patterns/uk/identity/driving_licence.toml" +); +shipped_pattern!( + /// UK postcodes (BS7666 format, plus GIR 0AA). + fn postcode from "../../../assets/patterns/uk/contact/postcode.toml" +); +shipped_pattern!( + /// UK vehicle registration numbers (current, prefix, and + /// suffix eras). + fn vehicle_registration from "../../../assets/patterns/uk/vehicle/registration.toml" +); +shipped_pattern!( + /// UK passport numbers (post-2015 format). Weak score; relies + /// on the context-keyword boost. + fn passport from "../../../assets/patterns/uk/identity/passport.toml" +); + +/// Every UK-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec<Regex> { + vec![ + nhs(), + nino(), + driving_licence(), + postcode(), + vehicle_registration(), + passport(), + ] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/us.rs b/crates/nvisy-pattern/src/shipped/patterns/us.rs new file mode 100644 index 00000000..7f3215a8 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/us.rs @@ -0,0 +1,64 @@ +//! United States — patterns scoped to US jurisdictional formats. + +use crate::Regex; +use crate::__shipped_pattern as shipped_pattern; + +shipped_pattern!( + /// US bank routing numbers (ABA RTN, mod-10 validated). + fn bank_routing from "../../../assets/patterns/us/finance/bank_routing.toml" +); +shipped_pattern!( + /// US Social Security numbers (AAA-GG-SSSS). + fn ssn from "../../../assets/patterns/us/identity/ssn.toml" +); +shipped_pattern!( + /// US driver's license numbers (state-shape union). + fn drivers_license from "../../../assets/patterns/us/identity/drivers_license.toml" +); +shipped_pattern!( + /// US passport numbers. + fn passport from "../../../assets/patterns/us/identity/passport.toml" +); +shipped_pattern!( + /// US ZIP and ZIP+4 postal codes. 
+ fn postal_code from "../../../assets/patterns/us/identity/postal_code.toml" +); +shipped_pattern!( + /// US Individual Taxpayer Identification Number (ITIN). + fn itin from "../../../assets/patterns/us/identity/itin.toml" +); +shipped_pattern!( + /// US National Provider Identifier (NPI, Luhn-on-80840 validated). + fn npi from "../../../assets/patterns/us/health/npi.toml" +); +shipped_pattern!( + /// US Medicare Beneficiary Identifier (MBI). + fn mbi from "../../../assets/patterns/us/health/mbi.toml" +); +shipped_pattern!( + /// US generic bank account number (8-17 digits, no checksum). + /// Very weak score; relies on the context-keyword boost. + fn bank_account from "../../../assets/patterns/us/finance/bank_account.toml" +); +shipped_pattern!( + /// US DEA registration number (medical license, + /// checksum-validated). + fn medical_license from "../../../assets/patterns/us/health/medical_license.toml" +); + +/// Every US-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec<Regex> { + vec![ + bank_routing(), + ssn(), + drivers_license(), + passport(), + postal_code(), + itin(), + npi(), + mbi(), + bank_account(), + medical_license(), + ] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/world.rs b/crates/nvisy-pattern/src/shipped/patterns/world.rs new file mode 100644 index 00000000..d4f928a2 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/world.rs @@ -0,0 +1,106 @@ +//! Universal patterns — apply regardless of jurisdiction. + +use crate::Regex; +use crate::__shipped_pattern as shipped_pattern; + +shipped_pattern!( + /// Email address (RFC-loose). + fn email from "../../../assets/patterns/world/contact/email.toml" +); +shipped_pattern!( + /// International phone numbers. + fn phone from "../../../assets/patterns/world/contact/phone.toml" +); +shipped_pattern!( + /// URLs (HTTP/HTTPS/FTP). + fn url from "../../../assets/patterns/world/contact/url.toml" +); + +shipped_pattern!( + /// AWS access key IDs. 
+ fn aws_key from "../../../assets/patterns/world/credentials/aws_key.toml" +); +shipped_pattern!( + /// Heuristic generic API key. + fn generic_api_key from "../../../assets/patterns/world/credentials/generic_api_key.toml" +); +shipped_pattern!( + /// GitHub personal access tokens. + fn github_token from "../../../assets/patterns/world/credentials/github_token.toml" +); +shipped_pattern!( + /// PEM-formatted private keys. + fn private_key from "../../../assets/patterns/world/credentials/private_key.toml" +); +shipped_pattern!( + /// Stripe live/test secret keys. + fn stripe_key from "../../../assets/patterns/world/credentials/stripe_key.toml" +); + +shipped_pattern!( + /// Bitcoin (legacy + bech32) addresses. + fn bitcoin_address from "../../../assets/patterns/world/finance/bitcoin_address.toml" +); +shipped_pattern!( + /// Credit-card numbers, Luhn-validated. + fn credit_card from "../../../assets/patterns/world/finance/credit_card.toml" +); +shipped_pattern!( + /// Ethereum addresses. + fn ethereum_address from "../../../assets/patterns/world/finance/ethereum_address.toml" +); +shipped_pattern!( + /// International Bank Account Numbers. + fn iban from "../../../assets/patterns/world/finance/iban.toml" +); +shipped_pattern!( + /// SWIFT / BIC codes. + fn swift_code from "../../../assets/patterns/world/finance/swift_code.toml" +); + +shipped_pattern!( + /// IPv4 addresses. + fn ipv4 from "../../../assets/patterns/world/network/ipv4.toml" +); +shipped_pattern!( + /// IPv6 addresses. + fn ipv6 from "../../../assets/patterns/world/network/ipv6.toml" +); +shipped_pattern!( + /// MAC (Ethernet) addresses. + fn mac_address from "../../../assets/patterns/world/network/mac_address.toml" +); + +shipped_pattern!( + /// Date of birth in common written formats. + fn date_of_birth from "../../../assets/patterns/world/personal/date_of_birth.toml" +); +shipped_pattern!( + /// Date + time stamps in ISO-like formats. 
+ fn datetime from "../../../assets/patterns/world/personal/datetime.toml" +); + +/// Every world-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec<Regex> { + vec![ + email(), + phone(), + url(), + aws_key(), + generic_api_key(), + github_token(), + private_key(), + stripe_key(), + bitcoin_address(), + credit_card(), + ethereum_address(), + iban(), + swift_code(), + ipv4(), + ipv6(), + mac_address(), + date_of_birth(), + datetime(), + ] +} diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index 46395652..4eb8d853 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -7,10 +7,13 @@ //! out structurally-suspect false positives that a regex alone //! can't. //! -//! [`ValidatorRegistry::builtin`] ships with [`luhn`], [`iban`], -//! [`ssn`], [`phone`], and [`date`]. Each validator is also -//! re-exported as a free function so consumers can compose a -//! custom registry without taking the full set. +//! [`ValidatorRegistry::builtin`] ships universal validators +//! ([`luhn`], [`iban`], [`phone`], [`date`]) plus jurisdiction- +//! scoped sets re-exported from [`us`] (`"us.ssn"`, +//! `"us.aba_routing"`, `"us.npi"`, `"us.dea_number"`) and [`uk`] +//! (`"uk.nhs"`, `"uk.nino"`). Each validator is also re-exported +//! as a free function so consumers can compose a custom registry +//! without taking the full set. //! //! [`Variant`]: crate::Variant //! [`Regex`]: crate::Regex @@ -19,7 +22,9 @@ mod date; mod iban; mod luhn; mod phone; -mod ssn; + +pub mod uk; +pub mod us; use std::borrow::Cow; use std::collections::HashMap; @@ -29,7 +34,6 @@ pub use self::date::date; pub use self::iban::iban; pub use self::luhn::luhn; pub use self::phone::phone; -pub use self::ssn::ssn; /// Post-match validator returning whether a matched string is /// structurally valid. 
@@ -74,16 +78,28 @@ impl ValidatorRegistry { Self::default() } - /// Construct a registry pre-loaded with the built-in - /// validators: [`luhn`], [`iban`], [`ssn`], [`phone`], [`date`]. + /// Construct a registry pre-loaded with the shipped built-in + /// validators. + /// + /// Universal keys: `"luhn"`, `"iban"`, `"phone"`, `"date"`. + /// + /// US-scoped: `"us.ssn"`, `"us.aba_routing"`, `"us.npi"`, + /// `"us.dea_number"`. + /// + /// UK-scoped: `"uk.nhs"`, `"uk.nino"`. #[must_use] pub fn builtin() -> Self { Self::empty() .with("luhn", luhn) .with("iban", iban) - .with("ssn", ssn) .with("phone", phone) .with("date", date) + .with("us.ssn", us::ssn) + .with("us.aba_routing", us::aba_routing) + .with("us.npi", us::npi) + .with("us.dea_number", us::dea_number) + .with("uk.nhs", uk::nhs) + .with("uk.nino", uk::nino) } /// Register `validator` under `name`, overwriting any previous diff --git a/crates/nvisy-pattern/src/validators/uk/mod.rs b/crates/nvisy-pattern/src/validators/uk/mod.rs new file mode 100644 index 00000000..37eadd66 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/uk/mod.rs @@ -0,0 +1,12 @@ +//! UK-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"uk.nhs"`, `"uk.nino"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod nhs; +mod nino; + +pub use self::nhs::nhs; +pub use self::nino::nino; diff --git a/crates/nvisy-pattern/src/validators/uk/nhs.rs b/crates/nvisy-pattern/src/validators/uk/nhs.rs new file mode 100644 index 00000000..582683e4 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/uk/nhs.rs @@ -0,0 +1,67 @@ +//! UK NHS number checksum validator. +//! +//! See `assets/NOTICE.md` for third-party attribution. + +/// Return `true` if `value` is a valid 10-digit UK NHS number. 
+/// +/// The NHS algorithm multiplies each of the 10 digits by descending +/// weights `[10, 9, 8, …, 1]` and accepts the number when the sum +/// is divisible by 11. Equivalent to checking that the last digit +/// equals `(11 - (weighted_sum_of_first_9 % 11)) % 11`, rejecting +/// the special case where the expected check digit would be 10. +/// +/// Whitespace and `-` separators are stripped before validation, +/// so `"943 476 5919"`, `"943-476-5919"`, and `"9434765919"` are +/// all equivalent inputs. +pub fn nhs(value: &str) -> bool { + let digits: Vec<u32> = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .map(|c| c.to_digit(10)) + .collect::<Option<Vec<_>>>() + .unwrap_or_default(); + if digits.len() != 10 { + return false; + } + let total: u32 = digits + .iter() + .zip((1..=10).rev()) + .map(|(d, w)| d * w) + .sum(); + total.is_multiple_of(11) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_known_numbers() { + // Test number commonly used in NHS sandboxes. + assert!(nhs("9434765919")); + // Spaces and dashes are stripped. + assert!(nhs("943 476 5919")); + assert!(nhs("943-476-5919")); + } + + #[test] + fn invalid_check_digit() { + // Wrong final digit fails the mod-11 check. + assert!(!nhs("9434765918")); + assert!(!nhs("9434765910")); + } + + #[test] + fn rejects_non_digit_payload() { + // Embedded letters can never become a 10-digit checksum + // input. + assert!(!nhs("ABC4765919")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!nhs("123")); + assert!(!nhs("12345678901")); + assert!(!nhs("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/uk/nino.rs b/crates/nvisy-pattern/src/validators/uk/nino.rs new file mode 100644 index 00000000..0ff86604 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/uk/nino.rs @@ -0,0 +1,57 @@ +//! UK National Insurance Number prefix validator. +//! +//! Reserved-prefix exclusion lives here in the validator because +//! Rust's `regex` crate does not support look-around. 
+ +/// Return `true` when `value`'s leading two-letter prefix is not +/// a reserved NINO prefix. +/// +/// Reserved prefixes (case-insensitive): `BG`, `GB`, `NK`, `KN`, +/// `NT`, `TN`, `ZZ`. The check is structural only — it does not +/// confirm the trailing suffix letter or any HMRC issuance state. +pub fn nino(value: &str) -> bool { + let prefix: String = value + .chars() + .filter(|c| !c.is_ascii_whitespace()) + .take(2) + .collect(); + if prefix.len() != 2 || !prefix.chars().all(|c| c.is_ascii_alphabetic()) { + return false; + } + let upper = prefix.to_ascii_uppercase(); + !matches!( + upper.as_str(), + "BG" | "GB" | "NK" | "KN" | "NT" | "TN" | "ZZ" + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_valid_prefix() { + assert!(nino("AB123456C")); + assert!(nino("JK 12 34 56 A")); + } + + #[test] + fn rejects_reserved_prefixes() { + for reserved in ["BG", "GB", "NK", "KN", "NT", "TN", "ZZ"] { + let value = format!("{reserved}123456A"); + assert!(!nino(&value), "{reserved} must be rejected"); + } + } + + #[test] + fn rejection_is_case_insensitive() { + assert!(!nino("bg123456A")); + assert!(!nino("Zz123456A")); + } + + #[test] + fn rejects_non_alpha_prefix() { + assert!(!nino("12345678A")); + assert!(!nino("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/us/aba_routing.rs b/crates/nvisy-pattern/src/validators/us/aba_routing.rs new file mode 100644 index 00000000..9ea3042d --- /dev/null +++ b/crates/nvisy-pattern/src/validators/us/aba_routing.rs @@ -0,0 +1,64 @@ +//! US ABA routing number checksum validator. + +/// Return `true` if `value` is a valid 9-digit ABA RTN. +/// +/// The ABA checksum sums the 9 digits with cyclic weights +/// `[3, 7, 1]` and accepts the number when the total is +/// divisible by 10. +/// +/// Whitespace and `-` separators are stripped before validation, +/// so `"121000358"`, `"1210-0035-8"`, and `"121 000 358"` are +/// equivalent inputs. 
+pub fn aba_routing(value: &str) -> bool { + let digits: Vec<u32> = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .map(|c| c.to_digit(10)) + .collect::<Option<Vec<_>>>() + .unwrap_or_default(); + if digits.len() != 9 { + return false; + } + let weights = [3, 7, 1, 3, 7, 1, 3, 7, 1]; + let total: u32 = digits.iter().zip(weights).map(|(d, w)| d * w).sum(); + total.is_multiple_of(10) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_known_numbers() { + // Wells Fargo SF (verified test vector). + assert!(aba_routing("121000358")); + // JPMorgan Chase NY. + assert!(aba_routing("021000021")); + // Citibank NY. + assert!(aba_routing("021000089")); + } + + #[test] + fn strips_separators() { + assert!(aba_routing("121-000-358")); + assert!(aba_routing("121 000 358")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!aba_routing("121000359")); + assert!(!aba_routing("000000001")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!aba_routing("12100035")); + assert!(!aba_routing("1210003580")); + assert!(!aba_routing("")); + } + + #[test] + fn rejects_non_digit_payload() { + assert!(!aba_routing("12100035A")); + } +} diff --git a/crates/nvisy-pattern/src/validators/us/dea_number.rs b/crates/nvisy-pattern/src/validators/us/dea_number.rs new file mode 100644 index 00000000..ecb8a68d --- /dev/null +++ b/crates/nvisy-pattern/src/validators/us/dea_number.rs @@ -0,0 +1,83 @@ +//! US DEA (Drug Enforcement Administration) registration number +//! checksum validator. +//! +//! See `assets/NOTICE.md` for third-party attribution. + +/// Return `true` if `value` is a valid DEA registration number. +/// +/// DEA numbers are 9 characters: two letters (the registration +/// type and the surname initial) followed by seven digits, where +/// the last digit is a checksum. +/// +/// The check takes the odd-position digits `d1, d3, d5` and the +/// even-position digits `d2, d4, d6`, then verifies that +/// `(sum(odd) + 2 * sum(even)) % 10 == d7`. 
+/// +/// Whitespace and `-` separators are stripped before validation. +pub fn dea_number(value: &str) -> bool { + let cleaned: String = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .collect(); + if cleaned.len() != 9 { + return false; + } + let mut chars = cleaned.chars(); + let first = chars.next().unwrap(); + let second = chars.next().unwrap(); + if !first.is_ascii_alphabetic() || !second.is_ascii_alphabetic() { + return false; + } + let digits: Vec<u32> = chars.map(|c| c.to_digit(10)).collect::<Option<Vec<_>>>().unwrap_or_default(); + if digits.len() != 7 { + return false; + } + let sum_odd = digits[0] + digits[2] + digits[4]; + let sum_even = digits[1] + digits[3] + digits[5]; + let expected = (sum_odd + 2 * sum_even) % 10; + expected == digits[6] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_known_dea_numbers() { + // AB1234563: odd = 1+3+5 = 9, even = 2+4+6 = 12, + // (9 + 24) % 10 = 3 → matches d7 = 3. + assert!(dea_number("AB1234563")); + // Near misses, for reference — BC9876562: odd = 9+7+5 = 21, + // even = 8+6+6 = 20, (21 + 40) % 10 = 1, but d7 = 2; + // AF3456788: odd = 3+5+7 = 15, even = 4+6+8 = 18, + // (15 + 36) % 10 = 1, but d7 = 8. Neither is asserted here. + // BB0000000: odd = 0+0+0 = 0, even = 0+0+0 = 0, + // (0 + 0) % 10 = 0 → matches d7 = 0. 
+ assert!(dea_number("BB0000000")); + } + + #[test] + fn strips_separators() { + assert!(dea_number("AB-12-34563")); + assert!(dea_number("AB 12 34563")); + } + + #[test] + fn rejects_wrong_check_digit() { + assert!(!dea_number("AB1234560")); + assert!(!dea_number("AB1234565")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!dea_number("AB123")); + assert!(!dea_number("AB12345630")); + assert!(!dea_number("")); + } + + #[test] + fn rejects_non_letter_prefix() { + assert!(!dea_number("123456789")); + assert!(!dea_number("A21234563")); + } +} diff --git a/crates/nvisy-pattern/src/validators/us/mod.rs b/crates/nvisy-pattern/src/validators/us/mod.rs new file mode 100644 index 00000000..60b1d300 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/us/mod.rs @@ -0,0 +1,16 @@ +//! US-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"us.ssn"`, `"us.aba_routing"`, etc. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod aba_routing; +mod dea_number; +mod npi; +mod ssn; + +pub use self::aba_routing::aba_routing; +pub use self::dea_number::dea_number; +pub use self::npi::npi; +pub use self::ssn::ssn; diff --git a/crates/nvisy-pattern/src/validators/us/npi.rs b/crates/nvisy-pattern/src/validators/us/npi.rs new file mode 100644 index 00000000..533a6608 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/us/npi.rs @@ -0,0 +1,66 @@ +//! US National Provider Identifier (NPI) checksum validator. +//! +//! See `assets/NOTICE.md` for third-party attribution. + +use super::super::luhn::luhn; + +/// Return `true` if `value` is a valid 10-digit US NPI. +/// +/// The CMS algorithm prepends the constant `"80840"` to the +/// 10-digit identifier and runs the standard Luhn checksum on +/// the resulting 15-digit string. +/// +/// Whitespace and `-` separators are stripped before validation. 
+pub fn npi(value: &str) -> bool { + let digits: String = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .collect(); + if digits.len() != 10 || !digits.chars().all(|c| c.is_ascii_digit()) { + return false; + } + // Reject all-same-digit bodies (e.g. `1111111111`); they pass + // Luhn but are not real provider numbers. + let body = &digits[..9]; + if body.chars().all(|c| c == body.chars().next().unwrap()) { + return false; + } + let prefixed = format!("80840{digits}"); + luhn(&prefixed) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn valid_known_npi_number() { + // Test vector validated against the CMS Luhn-on-80840 algorithm. + assert!(npi("1234567893")); + } + + #[test] + fn strips_separators() { + assert!(npi("1234-567-893")); + assert!(npi("1234 567 893")); + } + + #[test] + fn rejects_wrong_check_digit() { + assert!(!npi("1234567890")); + assert!(!npi("1234567899")); + } + + #[test] + fn rejects_degenerate_all_same_digits() { + assert!(!npi("1111111111")); + assert!(!npi("2222222222")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!npi("123456789")); + assert!(!npi("12345678901")); + assert!(!npi("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/ssn.rs b/crates/nvisy-pattern/src/validators/us/ssn.rs similarity index 89% rename from crates/nvisy-pattern/src/validators/ssn.rs rename to crates/nvisy-pattern/src/validators/us/ssn.rs index 223408eb..d31cc66e 100644 --- a/crates/nvisy-pattern/src/validators/ssn.rs +++ b/crates/nvisy-pattern/src/validators/us/ssn.rs @@ -12,7 +12,7 @@ /// This is a format check only — not a verification against SSA /// records. 
pub fn ssn(value: &str) -> bool { - let parts: Vec<&str> = value.split('-').collect(); + let parts: Vec<&str> = value.split(['-', ' ', '.']).collect(); if parts.len() != 3 { return false; } @@ -42,6 +42,12 @@ mod tests { assert!(ssn("899-99-9999")); } + #[test] + fn accepts_space_and_dot_separators() { + assert!(ssn("123 45 6789")); + assert!(ssn("123.45.6789")); + } + #[test] fn invalid_area_zero() { assert!(!ssn("000-45-6789")); diff --git a/crates/nvisy-pattern/testdata/inputs/identity.txt b/crates/nvisy-pattern/testdata/inputs/identity.txt index 045a6238..cdcbd923 100644 --- a/crates/nvisy-pattern/testdata/inputs/identity.txt +++ b/crates/nvisy-pattern/testdata/inputs/identity.txt @@ -3,9 +3,12 @@ PATIENT INTAKE FORM Full name: Jane Smith Date of birth: 1985-03-14 SSN: 123-45-6789 +ITIN: 912-71-1234 Driver license: D123-4567-8901 Passport (US): A12345678 Mailing address: 742 Evergreen Terrace Springfield, OR 97477-1234 +Provider NPI: 1234567893 +Medicare MBI: 1EG4-TE5-MK73 Insurance card number on file (see attached). diff --git a/crates/nvisy-pattern/testdata/inputs/uk.txt b/crates/nvisy-pattern/testdata/inputs/uk.txt new file mode 100644 index 00000000..05dc0b42 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/uk.txt @@ -0,0 +1,12 @@ +Patient handover for Mrs A. Patel. + +Personal details: + - NHS number: 943 476 5919 + - NINO: AB123456C + - Driving licence (DVLA): MORGA753116SM9IJ + - Address: 10 Downing Street, London SW1A 2AA + +Vehicle: BMW 3 Series, registration AB51 ABC, V5C on file. + +Please update the patient record (national health service form 4) +and bill the National Insurance reference shown above. 
diff --git a/crates/nvisy-pattern/tests/shipped_detection.rs b/crates/nvisy-pattern/tests/shipped_detection.rs index 32d0ac13..b6b82cc6 100644 --- a/crates/nvisy-pattern/tests/shipped_detection.rs +++ b/crates/nvisy-pattern/tests/shipped_detection.rs @@ -79,6 +79,24 @@ async fn identity_inputs_yield_expected_entities() { builtins::DATE_OF_BIRTH.label_ref(), "1985-03-14", ); + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "912-71-1234", + ); + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "1234567893", + ); + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "1EG4-TE5-MK73", + ); } #[tokio::test] @@ -195,3 +213,38 @@ async fn personal_inputs_yield_expected_entities() { "expected at least one Language" ); } + +#[tokio::test] +async fn uk_inputs_yield_expected_entities() { + let (text, entities) = scan(include_str!("../testdata/inputs/uk.txt")).await; + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "943 476 5919", + ); + assert_match( + &text, + &entities, + builtins::NATIONAL_INSURANCE_NUMBER.label_ref(), + "AB123456C", + ); + assert_match( + &text, + &entities, + builtins::DRIVERS_LICENSE.label_ref(), + "MORGA753116SM9IJ", + ); + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "SW1A 2AA", + ); + assert_match( + &text, + &entities, + builtins::LICENSE_PLATE.label_ref(), + "AB51 ABC", + ); +} diff --git a/crates/nvisy-toolkit/tests/fixtures/registries.rs b/crates/nvisy-toolkit/tests/fixtures/registries.rs index 30098801..2056a1d0 100644 --- a/crates/nvisy-toolkit/tests/fixtures/registries.rs +++ b/crates/nvisy-toolkit/tests/fixtures/registries.rs @@ -50,13 +50,14 @@ where .insert_label(builtins::PAYMENT_CARD.label_ref(), Mask::stars()) } -/// Standard dedup params: a `0.5` confidence threshold drops the -/// low-confidence ISO-639 short-code matches from the languages -/// dictionary (see `assets/dictionaries/general/languages.toml`'s -/// 
`column_scores`). +/// Standard dedup params: a `0.35` confidence threshold sized +/// for our shipped patterns' baseline (most regex-only matches +/// land in 0.1–0.5 before context boost); a tighter threshold +/// would drop legitimate weak-pattern matches the context layer +/// is expected to lift. pub fn dedup_params() -> LayerParams { LayerParams { - confidence_threshold: Some(ConfidenceThreshold::new(0.5).unwrap()), + confidence_threshold: Some(ConfidenceThreshold::new(0.35).unwrap()), ..LayerParams::default() } } From ae6a6409734623da2093d5c215f26b26c2af4d48 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 16 Jun 2026 03:25:50 +0200 Subject: [PATCH 10/14] test(pattern): split shipped_detection into per-region e2e binaries Restructure pattern crate end-to-end tests: - Replace single tests/shipped_detection.rs with three per-region binaries: tests/builtin.rs (5 world tests), tests/builtin_us.rs (3 US tests), tests/builtin_uk.rs (3 UK tests). 11 tests total. - Move shared scan + assert_match + assert_label_present helpers to tests/fixtures/mod.rs, declared via mod fixtures; from each binary. Both helpers carry #[track_caller] for better failure attribution. - Reshape testdata/inputs/ to mirror the asset tree: - world fixtures move from monolithic domain files into inputs/{contact,credentials,finance,network,personal}.txt - inputs/us/{identity,finance,health}.txt - inputs/uk/{identity,contact,vehicle}.txt (split from old uk.txt) - Each test scans one fixture, asserting substring + label matches against a recognizer loaded with every shipped pattern and dictionary via build_context_enhanced. - builtin_uk_identity asserts NATIONALITY (world dictionary firing on "British") to keep assert_label_present reachable across all three binaries. 
Co-Authored-By: Claude Opus 4.7 --- .../uk.txt => builtin/uk/identity.txt} | 0 .../nvisy-pattern/testdata/inputs/contact.txt | 9 +- .../testdata/inputs/credentials.txt | 10 +- .../nvisy-pattern/testdata/inputs/finance.txt | 9 +- .../testdata/inputs/uk/contact.txt | 7 + .../testdata/inputs/uk/identity.txt | 8 + .../testdata/inputs/uk/vehicle.txt | 8 + .../testdata/inputs/us/finance.txt | 10 + .../testdata/inputs/us/health.txt | 15 ++ .../testdata/inputs/{ => us}/identity.txt | 10 +- crates/nvisy-pattern/tests/builtin.rs | 143 ++++++++++ crates/nvisy-pattern/tests/builtin_uk.rs | 65 +++++ crates/nvisy-pattern/tests/builtin_us.rs | 85 ++++++ crates/nvisy-pattern/tests/fixtures/mod.rs | 60 +++++ .../nvisy-pattern/tests/shipped_detection.rs | 250 ------------------ 15 files changed, 420 insertions(+), 269 deletions(-) rename crates/nvisy-pattern/testdata/{inputs/uk.txt => builtin/uk/identity.txt} (100%) create mode 100644 crates/nvisy-pattern/testdata/inputs/uk/contact.txt create mode 100644 crates/nvisy-pattern/testdata/inputs/uk/identity.txt create mode 100644 crates/nvisy-pattern/testdata/inputs/uk/vehicle.txt create mode 100644 crates/nvisy-pattern/testdata/inputs/us/finance.txt create mode 100644 crates/nvisy-pattern/testdata/inputs/us/health.txt rename crates/nvisy-pattern/testdata/inputs/{ => us}/identity.txt (50%) create mode 100644 crates/nvisy-pattern/tests/builtin.rs create mode 100644 crates/nvisy-pattern/tests/builtin_uk.rs create mode 100644 crates/nvisy-pattern/tests/builtin_us.rs create mode 100644 crates/nvisy-pattern/tests/fixtures/mod.rs delete mode 100644 crates/nvisy-pattern/tests/shipped_detection.rs diff --git a/crates/nvisy-pattern/testdata/inputs/uk.txt b/crates/nvisy-pattern/testdata/builtin/uk/identity.txt similarity index 100% rename from crates/nvisy-pattern/testdata/inputs/uk.txt rename to crates/nvisy-pattern/testdata/builtin/uk/identity.txt diff --git a/crates/nvisy-pattern/testdata/inputs/contact.txt 
b/crates/nvisy-pattern/testdata/inputs/contact.txt index 975ccea5..fb6af992 100644 --- a/crates/nvisy-pattern/testdata/inputs/contact.txt +++ b/crates/nvisy-pattern/testdata/inputs/contact.txt @@ -1,9 +1,10 @@ Hi team, -Please reach out to alice.johnson@example.com if you have questions about -the proposal. For urgent matters, call me at +1 (415) 555-0142 or my office -line 415.555.0188. Background materials live at https://docs.example.com/proposal -and the secondary mirror is http://backup.example.org/proposal-v2. +Please reach out to alice.johnson@example.com if you have +questions about the proposal. For urgent matters, call me at ++1 (415) 555-0142 or my office line 415.555.0188. Background +materials live at https://docs.example.com/proposal and the +secondary mirror is http://backup.example.org/proposal-v2. Best, Bob diff --git a/crates/nvisy-pattern/testdata/inputs/credentials.txt b/crates/nvisy-pattern/testdata/inputs/credentials.txt index 100d9eb8..94b8c803 100644 --- a/crates/nvisy-pattern/testdata/inputs/credentials.txt +++ b/crates/nvisy-pattern/testdata/inputs/credentials.txt @@ -1,8 +1,10 @@ # config.env (DO NOT COMMIT) -# Note: all credentials below are obvious-fake placeholders chosen to -# exercise the credentials patterns without tripping push-protection -# scanners. AWS uses AWS's own documented example key; Stripe uses -# `sk_test_` (test-mode prefix); GitHub uses an EXAMPLE-suffixed token. +# +# Note: all credentials below are obvious-fake placeholders chosen +# to exercise the credential patterns without tripping +# push-protection scanners. AWS uses AWS's own documented example +# key; Stripe uses `sk_test_` (test-mode prefix); GitHub uses an +# EXAMPLE-suffixed token. 
AWS_ACCESS_KEY_ID=AKIAIOSFODNN7EXAMPLE GITHUB_TOKEN=ghp_EXAMPLE00000abcdefghijklmnopqrstuvwx diff --git a/crates/nvisy-pattern/testdata/inputs/finance.txt b/crates/nvisy-pattern/testdata/inputs/finance.txt index 367d0dd7..17d2d32d 100644 --- a/crates/nvisy-pattern/testdata/inputs/finance.txt +++ b/crates/nvisy-pattern/testdata/inputs/finance.txt @@ -1,12 +1,11 @@ Wire transfer authorization --------------------------- -Beneficiary: Acme Industries Ltd. -IBAN: GB29 NWBK 6016 1331 9268 19 -SWIFT/BIC: NWBKGB2L -Routing (US correspondent): 021000021 +Beneficiary: Acme Industries Ltd. +IBAN: GB29 NWBK 6016 1331 9268 19 +SWIFT/BIC: NWBKGB2L Charge card on file (backup): 4539 1488 0343 6467 -Settlement: in US Dollar, optionally EUR. +Settlement: in US Dollar, optionally EUR. Crypto reimbursement options: - Bitcoin: 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa diff --git a/crates/nvisy-pattern/testdata/inputs/uk/contact.txt b/crates/nvisy-pattern/testdata/inputs/uk/contact.txt new file mode 100644 index 00000000..9ce5dff9 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/uk/contact.txt @@ -0,0 +1,7 @@ +Correspondence address for Royal Mail delivery: + + 10 Downing Street + London SW1A 2AA + United Kingdom + +Use the postcode above when filing the mailing label. diff --git a/crates/nvisy-pattern/testdata/inputs/uk/identity.txt b/crates/nvisy-pattern/testdata/inputs/uk/identity.txt new file mode 100644 index 00000000..52b36edb --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/uk/identity.txt @@ -0,0 +1,8 @@ +Patient handover for Mrs A. Patel — refer to the NHS patient +file, the National Insurance reference, the DVLA driving licence, +and her HM Passport Office passport. 
+ +NHS number: 943 476 5919 +NI Number (NINO): AB123456C +Driving licence (DVLA): MORGA753116SM9IJ +British passport: AB1234567 diff --git a/crates/nvisy-pattern/testdata/inputs/uk/vehicle.txt b/crates/nvisy-pattern/testdata/inputs/uk/vehicle.txt new file mode 100644 index 00000000..b6d248c3 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/uk/vehicle.txt @@ -0,0 +1,8 @@ +Vehicle on file: + + Make / model: BMW 3 Series + Registration plate: AB51 ABC + DVLA V5C log book: yes + MOT current: yes + +Insurance to be billed against the registered vehicle above. diff --git a/crates/nvisy-pattern/testdata/inputs/us/finance.txt b/crates/nvisy-pattern/testdata/inputs/us/finance.txt new file mode 100644 index 00000000..aa0619b9 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/us/finance.txt @@ -0,0 +1,10 @@ +ACH transfer instruction (US correspondent) +------------------------------------------- + +Beneficiary bank: Wells Fargo Bank, N.A. +Routing (ABA RTN): 121000358 +Checking account: 0123456789012 +Wire memo: consultancy invoice 0042 + +Use the checking account number above for the ACH debit and the +routing number above for the bank deposit instruction. diff --git a/crates/nvisy-pattern/testdata/inputs/us/health.txt b/crates/nvisy-pattern/testdata/inputs/us/health.txt new file mode 100644 index 00000000..00ac7cb2 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/us/health.txt @@ -0,0 +1,15 @@ +Provider claim — Medicare crossover +----------------------------------- + +Rendering provider: + Name: Dr. Robert Hayes, MD + NPI: 1234567893 + DEA license: BB0000000 + Taxonomy code: 207R00000X + +Beneficiary: + Name: Henry Davies + Medicare MBI: 1EG4-TE5-MK73 + +Claim submitted for evaluation under Medicare Part B; the DEA +registration above authorizes prescribing of controlled substances. 
diff --git a/crates/nvisy-pattern/testdata/inputs/identity.txt b/crates/nvisy-pattern/testdata/inputs/us/identity.txt similarity index 50% rename from crates/nvisy-pattern/testdata/inputs/identity.txt rename to crates/nvisy-pattern/testdata/inputs/us/identity.txt index cdcbd923..ea9e0c77 100644 --- a/crates/nvisy-pattern/testdata/inputs/identity.txt +++ b/crates/nvisy-pattern/testdata/inputs/us/identity.txt @@ -2,13 +2,11 @@ PATIENT INTAKE FORM Full name: Jane Smith Date of birth: 1985-03-14 -SSN: 123-45-6789 -ITIN: 912-71-1234 +Social security: 123-45-6789 +ITIN (taxpayer): 912-71-1234 Driver license: D123-4567-8901 -Passport (US): A12345678 +US passport: A12345678 Mailing address: 742 Evergreen Terrace Springfield, OR 97477-1234 -Provider NPI: 1234567893 -Medicare MBI: 1EG4-TE5-MK73 -Insurance card number on file (see attached). +Confirm insurance card on file (see attached). diff --git a/crates/nvisy-pattern/tests/builtin.rs b/crates/nvisy-pattern/tests/builtin.rs new file mode 100644 index 00000000..2025384d --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin.rs @@ -0,0 +1,143 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! cross-jurisdiction (`world`) fixtures. +//! +//! Each test scans one `testdata/inputs/.txt` fixture +//! through a recognizer wired with every shipped pattern and +//! dictionary, then asserts the entities a real document of that +//! domain is expected to surface (substring + label, not +//! byte-offset, so fixtures and regexes can evolve without +//! brittle churn). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/contact.txt")).await; + assert_match( + &text, + &entities, + builtins::EMAIL_ADDRESS.label_ref(), + "alice.johnson@example.com", + ); + assert_match( + &text, + &entities, + builtins::URL.label_ref(), + "https://docs.example.com/proposal", + ); + assert_match( + &text, + &entities, + builtins::URL.label_ref(), + "http://backup.example.org/proposal-v2", + ); + assert_label_present(&entities, builtins::PHONE_NUMBER.label_ref()); +} + +#[tokio::test] +async fn builtin_credentials() { + let (_, entities) = scan(include_str!("../testdata/inputs/credentials.txt")).await; + assert_label_present(&entities, builtins::API_KEY.label_ref()); + assert_label_present(&entities, builtins::PRIVATE_KEY.label_ref()); + assert_label_present(&entities, builtins::AUTH_TOKEN.label_ref()); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::IBAN.label_ref(), + "GB29 NWBK 6016 1331 9268 19", + ); + assert_match( + &text, + &entities, + builtins::SWIFT_CODE.label_ref(), + "NWBKGB2L", + ); + assert_match( + &text, + &entities, + builtins::PAYMENT_CARD.label_ref(), + "4539 1488 0343 6467", + ); + assert_match( + &text, + &entities, + builtins::CRYPTO_ADDRESS.label_ref(), + "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa", + ); + assert_match( + &text, + &entities, + builtins::CRYPTO_ADDRESS.label_ref(), + "0x742d35Cc6634C0532925a3b844Bc9e7595f6E842", + ); + // Currency dictionaries pick up `USD`, `EUR`, `Tether`, `USDC`. 
+ assert_label_present(&entities, builtins::CURRENCY.label_ref()); +} + +#[tokio::test] +async fn builtin_network() { + let (text, entities) = scan(include_str!("../testdata/inputs/network.txt")).await; + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), + "192.168.1.42", + ); + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), + "10.0.0.7", + ); + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), + "203.0.113.55", + ); + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), + "2001:0db8:85a3:0000:0000:8a2e:0370:7334", + ); + assert_match( + &text, + &entities, + builtins::MAC_ADDRESS.label_ref(), + "00:1A:2B:3C:4D:5E", + ); + assert_match( + &text, + &entities, + builtins::MAC_ADDRESS.label_ref(), + "3C-22-FB-A1-B2-C3", + ); +} + +#[tokio::test] +async fn builtin_personal() { + let (text, entities) = scan(include_str!("../testdata/inputs/personal.txt")).await; + assert_match( + &text, + &entities, + builtins::DATE_OF_BIRTH.label_ref(), + "04/22/1979", + ); + assert_match( + &text, + &entities, + builtins::DATE_TIME.label_ref(), + "2024-06-15T09:30:00Z", + ); + assert_label_present(&entities, builtins::NATIONALITY.label_ref()); + assert_label_present(&entities, builtins::LANGUAGE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_uk.rs b/crates/nvisy-pattern/tests/builtin_uk.rs new file mode 100644 index 00000000..e166c73a --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_uk.rs @@ -0,0 +1,65 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! UK-jurisdiction fixtures (`testdata/inputs/uk/.txt`). +//! +//! Each test scans one UK fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real UK document of that domain is expected to +//! surface (substring + label). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/uk/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "943 476 5919", + ); + assert_match( + &text, + &entities, + builtins::NATIONAL_INSURANCE_NUMBER.label_ref(), + "AB123456C", + ); + assert_match( + &text, + &entities, + builtins::DRIVERS_LICENSE.label_ref(), + "MORGA753116SM9IJ", + ); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "AB1234567", + ); + // World nationality dictionary activates on UK text ("British"). + assert_label_present(&entities, builtins::NATIONALITY.label_ref()); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/uk/contact.txt")).await; + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "SW1A 2AA", + ); +} + +#[tokio::test] +async fn builtin_vehicle() { + let (text, entities) = scan(include_str!("../testdata/inputs/uk/vehicle.txt")).await; + assert_match( + &text, + &entities, + builtins::LICENSE_PLATE.label_ref(), + "AB51 ABC", + ); +} diff --git a/crates/nvisy-pattern/tests/builtin_us.rs b/crates/nvisy-pattern/tests/builtin_us.rs new file mode 100644 index 00000000..24b714cb --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_us.rs @@ -0,0 +1,85 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! US-jurisdiction fixtures (`testdata/inputs/us/.txt`). +//! +//! Each test scans one US fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real US document of that domain is expected to +//! surface (substring + label). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/us/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "123-45-6789", + ); + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "912-71-1234", + ); + assert_match( + &text, + &entities, + builtins::DRIVERS_LICENSE.label_ref(), + "D123-4567-8901", + ); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "A12345678", + ); + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "97477-1234", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/us/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::BANK_ROUTING.label_ref(), + "121000358", + ); + // bank_account is `\b\d{8,17}\b` with score 0.05 — it requires + // a context-keyword boost (e.g. `account`) to clear the + // confidence threshold. The fixture provides one. + assert_label_present(&entities, builtins::BANK_ACCOUNT.label_ref()); +} + +#[tokio::test] +async fn builtin_health() { + let (text, entities) = scan(include_str!("../testdata/inputs/us/health.txt")).await; + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "1234567893", + ); + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "1EG4-TE5-MK73", + ); + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "BB0000000", + ); +} diff --git a/crates/nvisy-pattern/tests/fixtures/mod.rs b/crates/nvisy-pattern/tests/fixtures/mod.rs new file mode 100644 index 00000000..1b27792e --- /dev/null +++ b/crates/nvisy-pattern/tests/fixtures/mod.rs @@ -0,0 +1,60 @@ +//! Shared helpers for the `builtin_*` end-to-end test suites. +//! +//! 
Each per-region test file (`tests/builtin.rs`, +//! `tests/builtin_us.rs`, `tests/builtin_uk.rs`) declares this +//! module via `mod fixtures;` and calls [`scan`] + the +//! `assert_*` helpers to express expectations against a single +//! shared [`PatternRecognizer`] built from every shipped pattern +//! and dictionary. + +use nvisy_core::entity::{Entity, EntityLabelRef}; +use nvisy_core::modality::{Text, TextData}; +use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; +use nvisy_pattern::PatternRecognizer; + +pub async fn scan(text: &str) -> (String, Vec>) { + let recognizer = PatternRecognizer::builder() + .with_builtin_patterns() + .with_builtin_dictionaries() + .build_context_enhanced() + .expect("shipped recognizer builds"); + let input = RecognizerInput::new(TextData::new(text.to_owned())); + let entities = recognizer + .recognize(&input) + .await + .expect("shipped recognize") + .entities; + (text.to_owned(), entities) +} + +#[track_caller] +pub fn assert_match( + text: &str, + entities: &[Entity], + label: EntityLabelRef, + needle: &str, +) { + let hit = entities + .iter() + .any(|e| e.label == label && &text[e.location.start..e.location.end] == needle); + assert!( + hit, + "expected `{needle}` as {label:?}; got: {:?}", + entities + .iter() + .map(|e| (e.label.clone(), &text[e.location.start..e.location.end])) + .collect::>() + ); +} + +#[track_caller] +pub fn assert_label_present(entities: &[Entity], label: EntityLabelRef) { + assert!( + entities.iter().any(|e| e.label == label), + "expected at least one {label:?} entity; got labels: {:?}", + entities + .iter() + .map(|e| e.label.clone()) + .collect::>() + ); +} diff --git a/crates/nvisy-pattern/tests/shipped_detection.rs b/crates/nvisy-pattern/tests/shipped_detection.rs deleted file mode 100644 index b6b82cc6..00000000 --- a/crates/nvisy-pattern/tests/shipped_detection.rs +++ /dev/null @@ -1,250 +0,0 @@ -//! End-to-end: load every shipped pattern + dictionary into one -//!
[`PatternRecognizer`], scan each `testdata/inputs/*.txt`, and -//! assert the entities a real document of that category is expected -//! to surface (by substring + kind). -//! -//! These are intentionally substring-based rather than offset-based -//! so the fixtures and shipped regexes can both evolve without -//! brittle byte-position churn. - -use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; -use nvisy_core::modality::{Text, TextData}; -use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; -use nvisy_pattern::PatternRecognizer; - -async fn scan(text: &str) -> (String, Vec>) { - let recognizer = PatternRecognizer::builder() - .with_builtin_patterns() - .with_builtin_dictionaries() - .build_context_enhanced() - .expect("shipped recognizer builds"); - let input = RecognizerInput::new(TextData::new(text.to_owned())); - let entities = recognizer - .recognize(&input) - .await - .expect("shipped recognize") - .entities; - (text.to_owned(), entities) -} - -fn assert_match(text: &str, entities: &[Entity], label: EntityLabelRef, needle: &str) { - let hit = entities - .iter() - .any(|e| e.label == label && &text[e.location.start..e.location.end] == needle); - assert!( - hit, - "expected `{needle}` as {label:?}; got: {:?}", - entities - .iter() - .map(|e| (e.label.clone(), &text[e.location.start..e.location.end])) - .collect::>() - ); -} - -#[tokio::test] -async fn contact_inputs_yield_expected_entities() { - let (text, entities) = scan(include_str!("../testdata/inputs/contact.txt")).await; - assert_match( - &text, - &entities, - builtins::EMAIL_ADDRESS.label_ref(), - "alice.johnson@example.com", - ); - assert_match( - &text, - &entities, - builtins::URL.label_ref(), - "https://docs.example.com/proposal", - ); - assert_match( - &text, - &entities, - builtins::URL.label_ref(), - "http://backup.example.org/proposal-v2", - ); -} - -#[tokio::test] -async fn identity_inputs_yield_expected_entities() { - let (text, entities) = 
scan(include_str!("../testdata/inputs/identity.txt")).await; - assert_match( - &text, - &entities, - builtins::GOVERNMENT_ID.label_ref(), - "123-45-6789", - ); - assert_match( - &text, - &entities, - builtins::DATE_OF_BIRTH.label_ref(), - "1985-03-14", - ); - assert_match( - &text, - &entities, - builtins::TAX_ID.label_ref(), - "912-71-1234", - ); - assert_match( - &text, - &entities, - builtins::MEDICAL_ID.label_ref(), - "1234567893", - ); - assert_match( - &text, - &entities, - builtins::MEDICAL_ID.label_ref(), - "1EG4-TE5-MK73", - ); -} - -#[tokio::test] -async fn finance_inputs_yield_expected_entities() { - let (text, entities) = scan(include_str!("../testdata/inputs/finance.txt")).await; - assert_match( - &text, - &entities, - builtins::PAYMENT_CARD.label_ref(), - "4539 1488 0343 6467", - ); - assert_match( - &text, - &entities, - builtins::CRYPTO_ADDRESS.label_ref(), - "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa", - ); - assert_match( - &text, - &entities, - builtins::CRYPTO_ADDRESS.label_ref(), - "0x742d35Cc6634C0532925a3b844Bc9e7595f6E842", - ); - // Currency and cryptocurrency dictionaries emit `Currency`; - // pick up `USD`, `EUR`, `Tether`, `USDC`, … - assert!( - entities - .iter() - .any(|e| e.label == builtins::CURRENCY.label_ref()), - "expected at least one currency/crypto dictionary hit" - ); -} - -#[tokio::test] -async fn credentials_inputs_yield_expected_entities() { - let (text, entities) = scan(include_str!("../testdata/inputs/credentials.txt")).await; - assert_match( - &text, - &entities, - builtins::API_KEY.label_ref(), - "AKIAIOSFODNN7EXAMPLE", - ); - // Private-key pattern matches the BEGIN header. 
- assert!( - entities - .iter() - .any(|e| e.label == builtins::PRIVATE_KEY.label_ref()), - "expected at least one PrivateKey entity" - ); -} - -#[tokio::test] -async fn network_inputs_yield_expected_entities() { - let (text, entities) = scan(include_str!("../testdata/inputs/network.txt")).await; - assert_match( - &text, - &entities, - builtins::IP_ADDRESS.label_ref(), - "192.168.1.42", - ); - assert_match( - &text, - &entities, - builtins::IP_ADDRESS.label_ref(), - "10.0.0.7", - ); - assert_match( - &text, - &entities, - builtins::IP_ADDRESS.label_ref(), - "203.0.113.55", - ); - assert_match( - &text, - &entities, - builtins::IP_ADDRESS.label_ref(), - "2001:0db8:85a3:0000:0000:8a2e:0370:7334", - ); - assert_match( - &text, - &entities, - builtins::MAC_ADDRESS.label_ref(), - "00:1A:2B:3C:4D:5E", - ); -} - -#[tokio::test] -async fn personal_inputs_yield_expected_entities() { - let (text, entities) = scan(include_str!("../testdata/inputs/personal.txt")).await; - assert_match( - &text, - &entities, - builtins::DATE_OF_BIRTH.label_ref(), - "04/22/1979", - ); - assert_match( - &text, - &entities, - builtins::DATE_TIME.label_ref(), - "2024-06-15T09:30:00Z", - ); - // Nationality and language dictionaries pick up `Italian`, - // `Canadian`, `English`, `Spanish`. 
- assert!( - entities - .iter() - .any(|e| e.label == builtins::NATIONALITY.label_ref()), - "expected at least one Nationality" - ); - assert!( - entities - .iter() - .any(|e| e.label == builtins::LANGUAGE.label_ref()), - "expected at least one Language" - ); -} - -#[tokio::test] -async fn uk_inputs_yield_expected_entities() { - let (text, entities) = scan(include_str!("../testdata/inputs/uk.txt")).await; - assert_match( - &text, - &entities, - builtins::MEDICAL_ID.label_ref(), - "943 476 5919", - ); - assert_match( - &text, - &entities, - builtins::NATIONAL_INSURANCE_NUMBER.label_ref(), - "AB123456C", - ); - assert_match( - &text, - &entities, - builtins::DRIVERS_LICENSE.label_ref(), - "MORGA753116SM9IJ", - ); - assert_match( - &text, - &entities, - builtins::POSTAL_CODE.label_ref(), - "SW1A 2AA", - ); - assert_match( - &text, - &entities, - builtins::LICENSE_PLATE.label_ref(), - "AB51 ABC", - ); -} From 2137db585ba62ae3b64cc3aa923858c1af057bbe Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 16 Jun 2026 06:57:01 +0200 Subject: [PATCH 11/14] feat(pattern,context): Presidio-aligned audit fixes + tabular context propagation Tier 1 (correctness bugs): - world/iban regex: extend middle groups from \d{4} to [A-Z0-9]{4} and separator \s? to [\s\-]?; accepts IBANs with letters past position 8 (UK NWBK, IE, MT, MK, GI) and hyphenated forms that the mod-97 validator was already prepared to handle. - world/private_key: match the full BEGIN..END PEM block instead of only the header line; add ENCRYPTED PRIVATE KEY, PGP, SSH2, and PuTTY-User-Key-File-{2,3} variants. - us/medical_license: add \b anchors to both DEA variants; prior pattern matched inside longer alphanumeric tokens. - uk.nino validator: reject O as the position-0 letter (HMRC reserved); the character class blocks D/F/I/Q/U/V but allows O via the j-p range. 
- us/passport: add Presidio's context = [passport, passport#, travel document, us passport, united states passport] so the 0.1-base pattern can boost above threshold. - us/postal_code: drop score 0.5 -> 0.1, add context, ship a us.postal_code validator that rejects 00000. Tier 2 (coverage + scoring): - world/bitcoin_address: split legacy (Base58) from Bech32; bump Bech32 cap {25,39}->{25,59} for Taproot (bc1p...). Add a crypto.btc validator using bs58::decode_check. - world/credit_card: add Mastercard 2-series (2221-2720); drop score 0.5 -> 0.3 to match Presidio's deliberate baseline that expects context boost to do the rest. - world/aws_key: broaden access-key ID prefix to also catch ASIA/AIDA/AROA/ANPA/AGPA/AIPA; add a second variant for the 40-char secret access key; ship Presidio-style context. - world/github_token: add github_pat_[A-Z0-9_]{82} variant for the fine-grained PAT format introduced in 2022. - world/generic_api_key: accept whitespace separator alongside [:=] so `Authorization: Bearer ` matches. - uk/driving_licence + uk/vehicle/registration: add the Presidio validators we'd left on the table (99999 surname rejection, age-ID range 02-29 / 51-79 for current-format plates). Validator infrastructure: - world/phone: replace the regex+length validator with a phonenumber-crate-backed region-aware validator. The validator parses E.164 directly and falls back to the caller-specified country (via RecognizerInput.country) when present. Introduces a workspace-wide phonenumber = 0.3 dependency. - ValidatorRegistry::with_simple convenience for the ten context-free validators; with() stays the canonical entry point for ctx-aware validators (only phone today). Context-enhancer architecture: - Add RecognizerInput.context_hints: Vec for out-of-band context strings (CSV column headers, JSON keys, log field names) the caller wants treated as in-context. 
- nvisy-context::Enhancer::enhance now takes a Context bundle (text + tokens + language + hints) instead of four loose arguments. The hint path runs as a fallback when the in-text word window doesn't fire; at most one boost per rule per entity. - LiftedFromText in nvisy-toolkit gains chunk_hints; Tabular surfaces column_name as a hint so a `card` column header lifts a per-cell CC=0.3 match to ~0.65 via the existing boost pipeline (no synthetic score patching to clear the threshold). nvisy-context module split: - enhancer.rs -> enhancer/{mod, context, window}.rs - matcher.rs -> matching/{mod, matcher, lemma}.rs - tokens.rs + wrapper.rs -> io/{mod, tokens, wrapper}.rs - Public surface (Context, Enhancer, BoostRule, KeywordMatcher, SubstringMatcher, LemmaMatcher, ContextEnhanced, Token, Tokens) stays at the crate root via re-exports. - Drop 3 redundant enhancer tests (suffix-symmetry duplicate, unicode-too-distant duplicate, token-window symmetry); keep the 13 unique behaviors plus 2 new hint-path tests. Known gap: html_codec_e2e payment_card assertion fails because HTML chunks at text-node boundary and `4111...` loses the surrounding "payment card" context. The fix requires moving chunk_hints from LiftedFromText onto the Handler trait and overriding it on HtmlHandler to emit parent-element text as a hint. Tracked as follow-up. 
Co-Authored-By: Claude Opus 4.7 --- Cargo.lock | 159 ++++++++- Cargo.toml | 4 + crates/nvisy-context/src/enhancer/context.rs | 68 ++++ .../src/{enhancer.rs => enhancer/mod.rs} | 324 ++++++------------ crates/nvisy-context/src/enhancer/window.rs | 118 +++++++ crates/nvisy-context/src/io/mod.rs | 20 ++ crates/nvisy-context/src/{ => io}/tokens.rs | 0 crates/nvisy-context/src/{ => io}/wrapper.rs | 16 +- crates/nvisy-context/src/lib.rs | 12 +- crates/nvisy-context/src/matcher.rs | 134 -------- crates/nvisy-context/src/matching/lemma.rs | 65 ++++ crates/nvisy-context/src/matching/matcher.rs | 70 ++++ crates/nvisy-context/src/matching/mod.rs | 19 + crates/nvisy-core/src/recognition/input.rs | 17 + crates/nvisy-pattern/Cargo.toml | 6 + .../patterns/uk/identity/driving_licence.toml | 1 + .../patterns/uk/vehicle/registration.toml | 1 + .../patterns/us/health/medical_license.toml | 4 +- .../assets/patterns/us/identity/passport.toml | 7 + .../patterns/us/identity/postal_code.toml | 12 +- .../patterns/world/credentials/aws_key.toml | 22 +- .../world/credentials/generic_api_key.toml | 6 +- .../world/credentials/github_token.toml | 8 + .../world/credentials/private_key.toml | 13 +- .../world/finance/bitcoin_address.toml | 13 +- .../patterns/world/finance/credit_card.toml | 7 +- .../assets/patterns/world/finance/iban.toml | 2 +- .../src/recognition/recognizer.rs | 8 +- crates/nvisy-pattern/src/validators/btc.rs | 53 +++ crates/nvisy-pattern/src/validators/mod.rs | 99 ++++-- crates/nvisy-pattern/src/validators/phone.rs | 109 ++---- .../src/validators/uk/driving_licence.rs | 71 ++++ crates/nvisy-pattern/src/validators/uk/mod.rs | 7 +- .../nvisy-pattern/src/validators/uk/nino.rs | 20 +- .../src/validators/uk/vehicle_registration.rs | 69 ++++ crates/nvisy-pattern/src/validators/us/mod.rs | 2 + .../src/validators/us/postal_code.rs | 40 +++ .../testdata/builtin/uk/identity.txt | 12 - crates/nvisy-toolkit/src/detection/chunks.rs | 21 +- 39 files changed, 1145 insertions(+), 494 
deletions(-) create mode 100644 crates/nvisy-context/src/enhancer/context.rs rename crates/nvisy-context/src/{enhancer.rs => enhancer/mod.rs} (63%) create mode 100644 crates/nvisy-context/src/enhancer/window.rs create mode 100644 crates/nvisy-context/src/io/mod.rs rename crates/nvisy-context/src/{ => io}/tokens.rs (100%) rename crates/nvisy-context/src/{ => io}/wrapper.rs (84%) delete mode 100644 crates/nvisy-context/src/matcher.rs create mode 100644 crates/nvisy-context/src/matching/lemma.rs create mode 100644 crates/nvisy-context/src/matching/matcher.rs create mode 100644 crates/nvisy-context/src/matching/mod.rs create mode 100644 crates/nvisy-pattern/src/validators/btc.rs create mode 100644 crates/nvisy-pattern/src/validators/uk/driving_licence.rs create mode 100644 crates/nvisy-pattern/src/validators/uk/vehicle_registration.rs create mode 100644 crates/nvisy-pattern/src/validators/us/postal_code.rs delete mode 100644 crates/nvisy-pattern/testdata/builtin/uk/identity.txt diff --git a/Cargo.lock b/Cargo.lock index f987251d..2d9b1229 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -292,6 +292,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" +dependencies = [ + "critical-section", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -524,6 +533,16 @@ dependencies = [ "alloc-stdlib", ] +[[package]] +name = "bs58" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf88ba1141d185c399bee5288d850d63b8369520c1eafc32a0430b5b6c287bf4" +dependencies = [ + "sha2 0.10.9", + "tinyvec", +] + [[package]] name = "built" version = "0.8.1" @@ -706,6 +725,15 @@ dependencies = [ "cc", ] +[[package]] +name = "cobs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" +dependencies = [ + "thiserror", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -841,6 +869,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -1234,6 +1268,18 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" +[[package]] +name = "embedded-io" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" + +[[package]] +name = "embedded-io" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" + [[package]] name = "encoding_rs" version = "0.8.35" @@ -1684,6 +1730,15 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -1711,6 +1766,20 @@ version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +[[package]] +name = "heapless" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version", + "serde", + "spin", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.4.1" @@ -2389,6 
+2458,12 @@ dependencies = [ "include_dir", ] +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -2469,6 +2544,15 @@ dependencies = [ "weezl", ] +[[package]] +name = "lru-cache" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e24f1ad8321ca0e8a1e0ac13f23cb668e6f5466c2c57319f6a5cf1cc8e3b1c" +dependencies = [ + "linked-hash-map", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -3043,11 +3127,13 @@ version = "0.1.0" dependencies = [ "aho-corasick", "async-trait", + "bs58", "csv", "derive_builder", "derive_more", "nvisy-context", "nvisy-core", + "phonenumber", "regex", "serde", "tokio", @@ -3126,6 +3212,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" +[[package]] +name = "oncemutex" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44d11de466f4a3006fe8a5e7ec84e93b79c70cb992ae0aa0eb631ad2df8abfe2" + [[package]] name = "opaque-debug" version = "0.3.1" @@ -3285,6 +3377,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phonenumber" +version = "0.3.9+9.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9114f9c1683dd09c5f4fa024c89fdad783eaae21d3d52dd23ddaaffa29ffb168" +dependencies = [ + "either", + "fnv", + "nom 7.1.3", + "once_cell", + "postcard", + "quick-xml", + "regex", + "regex-cache", + "serde", + "serde_derive", + "strum 0.27.2", + "thiserror", +] + [[package]] name = "pin-project" version = "1.1.13" @@ -3369,6 +3481,19 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "postcard" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" +dependencies = [ + "cobs", + "embedded-io 0.4.0", + "embedded-io 0.6.1", + "heapless", + "serde", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -3508,6 +3633,15 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", +] + [[package]] name = "quick_cache" version = "0.6.23" @@ -3801,7 +3935,7 @@ dependencies = [ "aho-corasick", "memchr", "regex-automata", - "regex-syntax", + "regex-syntax 0.8.10", ] [[package]] @@ -3812,7 +3946,19 @@ checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax", + "regex-syntax 0.8.10", +] + +[[package]] +name = "regex-cache" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f7b62d69743b8b94f353b6b7c3deb4c5582828328bcb8d5fedf214373808793" +dependencies = [ + "lru-cache", + "oncemutex", + "regex", + "regex-syntax 0.6.29", ] [[package]] @@ -3821,6 +3967,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + [[package]] name = "regex-syntax" version = "0.8.10" @@ -4622,6 +4774,9 @@ name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" +dependencies = [ + "strum_macros 
0.27.2", +] [[package]] name = "strum" diff --git a/Cargo.toml b/Cargo.toml index 77a0822a..ee8f2000 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -100,6 +100,10 @@ lingua = { version = "1.8", default-features = false, features = ["english"] } unicode-segmentation = { version = "1.13", features = [] } unicode-normalization = { version = "0.1", features = [] } +# Checksum / encoding +bs58 = { version = "0.5", features = ["check"] } +phonenumber = { version = "0.3", default-features = false } + # Tabular document parsing csv = { version = "1.0", features = [] } calamine = { version = "0.35", features = [] } diff --git a/crates/nvisy-context/src/enhancer/context.rs b/crates/nvisy-context/src/enhancer/context.rs new file mode 100644 index 00000000..85278b1c --- /dev/null +++ b/crates/nvisy-context/src/enhancer/context.rs @@ -0,0 +1,68 @@ +//! [`Context`]: per-call inputs bundled for [`Enhancer::enhance`]. +//! +//! [`Enhancer::enhance`]: super::Enhancer::enhance + +use nvisy_core::primitive::LanguageTag; + +use crate::io::Token; + +/// Per-call inputs bundled together so the enhancer's internal +/// methods don't drag a long argument list through every layer. +/// +/// All fields borrow; the value lives for the duration of one +/// [`Enhancer::enhance`] call. +/// +/// [`Enhancer::enhance`]: super::Enhancer::enhance +#[derive(Clone, Copy)] +pub struct Context<'a> { + /// Full text the entities' byte offsets index into. + pub text: &'a str, + /// Optional token artifact produced by an upstream NLP + /// engine. When present, word-window counting walks the token + /// stream; when absent, words are derived from `text` via + /// Unicode word segmentation. + pub tokens: Option<&'a [Token]>, + /// Per-call language hint. `None` means "unknown" — every + /// per-language rule applies as a permissive fallback. 
+ pub language: Option<&'a LanguageTag>, + /// Out-of-band context strings (CSV column headers, JSON + /// object keys, log field names) the caller wants treated as + /// in-context. Each hint is fed to the matcher as its own + /// one-string window; a hit boosts the entity exactly as an + /// in-text keyword would. + pub hints: &'a [String], +} + +impl<'a> Context<'a> { + /// Construct a context with just the source text; every + /// other field defaults to empty. + pub fn new(text: &'a str) -> Self { + Self { + text, + tokens: None, + language: None, + hints: &[], + } + } + + /// Attach a token artifact. + #[must_use] + pub fn with_tokens(mut self, tokens: &'a [Token]) -> Self { + self.tokens = Some(tokens); + self + } + + /// Attach a language hint. + #[must_use] + pub fn with_language(mut self, language: &'a LanguageTag) -> Self { + self.language = Some(language); + self + } + + /// Attach out-of-band hint strings. + #[must_use] + pub fn with_hints(mut self, hints: &'a [String]) -> Self { + self.hints = hints; + self + } +} diff --git a/crates/nvisy-context/src/enhancer.rs b/crates/nvisy-context/src/enhancer/mod.rs similarity index 63% rename from crates/nvisy-context/src/enhancer.rs rename to crates/nvisy-context/src/enhancer/mod.rs index 176efcc9..fa744f83 100644 --- a/crates/nvisy-context/src/enhancer.rs +++ b/crates/nvisy-context/src/enhancer/mod.rs @@ -5,16 +5,25 @@ use std::collections::HashMap; use nvisy_core::entity::{Entity, EntityLabelRef, TrailStep}; use nvisy_core::modality::Text; -use nvisy_core::primitive::LanguageTag; -use unicode_segmentation::UnicodeSegmentation; -use super::matcher::KeywordMatcher; -use super::rule::BoostRule; -use super::tokens::Token; +use crate::matching::KeywordMatcher; +use crate::rule::BoostRule; +use crate::io::Token; -/// Source name stamped onto every refinement [`TrailStep`] the -/// enhancer appends. 
-const TRAIL_SOURCE: &str = "context"; +mod context; +mod window; + +pub use self::context::Context; + +use self::window::{slice_tokens_around, token_span, word_window}; + +/// Source name stamped onto refinement [`TrailStep`]s the +/// enhancer appends when the in-text word window fires. +const TRAIL_SOURCE_WINDOW: &str = "context"; + +/// Source name stamped onto refinement [`TrailStep`]s the +/// enhancer appends when an out-of-band hint fires. +const TRAIL_SOURCE_HINT: &str = "context-hint"; /// Post-recognition enhancer. Holds a label-keyed [`BoostRule`] /// map plus the keyword-matching strategy, and lifts the @@ -31,8 +40,8 @@ const TRAIL_SOURCE: &str = "context"; /// the enhancer: [`SubstringMatcher`] when no upstream NLP engine /// produces tokens, [`LemmaMatcher`] when one does. /// -/// [`SubstringMatcher`]: super::SubstringMatcher -/// [`LemmaMatcher`]: super::LemmaMatcher +/// [`SubstringMatcher`]: crate::SubstringMatcher +/// [`LemmaMatcher`]: crate::LemmaMatcher pub struct Enhancer { /// Rules bucketed by label. Within one bucket, each entry is /// a distinct `(language)` scope; rules sharing the same @@ -86,67 +95,45 @@ impl Enhancer { /// Apply boost rules to `entities` in place. For each entity: /// walk every rule registered for its label whose language - /// scope applies under `language`, walk a window of + /// scope applies under `ctx.language`, walk a window of /// `prefix_words` words before and `suffix_words` words after /// the entity's location, ask the matcher whether any keyword /// fires, and on a hit lift confidence by the rule's `boost` /// (saturating at the [`Confidence`] ceiling) plus append a /// [`Refinement`] trail step. /// - /// `tokens` is the optional token artifact produced by an - /// upstream NLP engine. When present, words are counted - /// against the token stream; when absent, words are derived - /// from the source text via Unicode word segmentation. - /// - /// `language` is the per-call language hint. 
`None` means - /// "unknown" — every per-language rule applies as a - /// permissive fallback. + /// The in-text and hint paths are independent — at most one + /// boost per rule fires per entity (window first, hint as + /// fallback) so a rule with a long keyword list can't + /// double-dip. /// /// [`Confidence`]: nvisy_core::primitive::Confidence /// [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement - pub fn enhance( - &self, - entities: &mut [Entity], - text: &str, - tokens: Option<&[Token]>, - language: Option<&LanguageTag>, - ) { + pub fn enhance(&self, entities: &mut [Entity], ctx: &Context<'_>) { if self.rules.is_empty() { return; } for entity in entities { - self.enhance_one(entity, text, tokens, language); + self.enhance_one(entity, ctx); } } - fn enhance_one( - &self, - entity: &mut Entity, - text: &str, - tokens: Option<&[Token]>, - language: Option<&LanguageTag>, - ) { + fn enhance_one(&self, entity: &mut Entity, ctx: &Context<'_>) { let Some(bucket) = self.rules.get(&entity.label) else { return; }; for rule in bucket { - if !rule.applies_to_language(language) { + if !rule.applies_to_language(ctx.language) { continue; } if rule.keywords.is_empty() { continue; } - self.apply_rule(entity, rule, text, tokens); + self.apply_rule(entity, rule, ctx); } } - fn apply_rule( - &self, - entity: &mut Entity, - rule: &BoostRule, - text: &str, - tokens: Option<&[Token]>, - ) { + fn apply_rule(&self, entity: &mut Entity, rule: &BoostRule, ctx: &Context<'_>) { let start = entity.location.start; let end = entity.location.end; @@ -156,23 +143,32 @@ impl Enhancer { // `tokens: None`, `tokens: Some(&[])`, and the "tokens // present but none overlap the entity" case (e.g. NLP // engine only tokenized part of the document). 
- let token_slice = tokens + let token_slice = ctx + .tokens .map(|toks| slice_tokens_around(toks, start, end, rule.prefix_words, rule.suffix_words)) .unwrap_or(&[]); let (snippet, tokens_in_window): (&str, &[Token]) = if token_slice.is_empty() { - let snippet = word_window(text, start, end, rule.prefix_words, rule.suffix_words); + let snippet = word_window(ctx.text, start, end, rule.prefix_words, rule.suffix_words); (snippet, &[]) } else { - let snippet = token_span(text, token_slice, start, end); + let snippet = token_span(ctx.text, token_slice, start, end); (snippet, token_slice) }; - if !self + let source = if self .matcher .any_match(snippet, tokens_in_window, &rule.keywords) { + TRAIL_SOURCE_WINDOW + } else if ctx + .hints + .iter() + .any(|h| self.matcher.any_match(h, &[], &rule.keywords)) + { + TRAIL_SOURCE_HINT + } else { return; - } + }; let original = entity.confidence; let adjusted = original.saturating_add(rule.boost.get()); @@ -182,7 +178,7 @@ impl Enhancer { entity.confidence = adjusted; entity.trail.push(TrailStep::refinement( - TRAIL_SOURCE, + source, original, adjusted, format!( @@ -194,97 +190,6 @@ impl Enhancer { } } -/// Walk `prefix` words before `[start, end)` and `suffix` words -/// after, via Unicode word segmentation, and return the spanning -/// substring (including any non-word whitespace and punctuation -/// between words). The returned slice covers `[start, end)` itself -/// plus the prefix / suffix words; the entity's own bytes are -/// always inside. -fn word_window(text: &str, start: usize, end: usize, prefix: usize, suffix: usize) -> &str { - let prefix_text = &text[..start.min(text.len())]; - let suffix_text = &text[end.min(text.len())..]; - - // `unicode_word_indices` yields `(byte_offset, word_str)` for - // every "word" (alphanumeric run) in source order. Take the - // last `prefix` on the prefix side, the first `suffix` on the - // suffix side, and compute the spanning byte range. 
- let prefix_words: Vec<(usize, &str)> = prefix_text.unicode_word_indices().collect(); - let prefix_take = prefix_words.len().saturating_sub(prefix); - let prefix_byte = prefix_words - .get(prefix_take) - .map(|(idx, _)| *idx) - .unwrap_or(start.min(text.len())); - - let suffix_byte = if suffix == 0 { - end.min(text.len()) - } else { - suffix_text - .unicode_word_indices() - .nth(suffix - 1) - .map(|(idx, word)| end + idx + word.len()) - .unwrap_or(text.len()) - }; - - let lo = floor_char_boundary(text, prefix_byte); - let hi = ceil_char_boundary(text, suffix_byte.min(text.len())); - &text[lo..hi] -} - -fn floor_char_boundary(s: &str, mut pos: usize) -> usize { - while pos > 0 && !s.is_char_boundary(pos) { - pos -= 1; - } - pos -} - -fn ceil_char_boundary(s: &str, mut pos: usize) -> usize { - while pos < s.len() && !s.is_char_boundary(pos) { - pos += 1; - } - pos -} - -/// Slice tokens by *count*: take `prefix` tokens before the first -/// token overlapping `[start, end)` and `suffix` tokens after the -/// last. The returned slice is contiguous. -fn slice_tokens_around( - tokens: &[Token], - start: usize, - end: usize, - prefix: usize, - suffix: usize, -) -> &[Token] { - if tokens.is_empty() { - return &[]; - } - // First token whose `offset.end > start` overlaps or follows the entity. - let first_overlap = tokens.partition_point(|t| t.offset.end <= start); - // One past the last token whose `offset.start < end` overlaps the entity. - let last_overlap = tokens.partition_point(|t| t.offset.start < end); - let lo = first_overlap.saturating_sub(prefix); - let hi = (last_overlap + suffix).min(tokens.len()); - if lo >= hi { - return &[]; - } - &tokens[lo..hi] -} - -/// Spanning substring covering `tokens` plus the entity itself. -/// Used to give the matcher a contiguous text window when slicing -/// against the token stream. -/// -/// Precondition: `tokens` is non-empty. 
Callers must take the -/// `word_window` fallback path when their token slice is empty — -/// see `Enhancer::enhance_one`. -fn token_span<'a>(text: &'a str, tokens: &[Token], start: usize, end: usize) -> &'a str { - debug_assert!(!tokens.is_empty(), "token_span requires non-empty slice"); - let lo = tokens[0].offset.start.min(start); - let hi = tokens[tokens.len() - 1].offset.end.max(end); - let lo = floor_char_boundary(text, lo.min(text.len())); - let hi = ceil_char_boundary(text, hi.min(text.len())); - &text[lo..hi] -} - #[cfg(test)] mod tests { use nvisy_core::entity::{ @@ -352,7 +257,7 @@ mod tests { )]); let text = "Your SSN: 123-45-6789"; let mut entities = vec![entity(govid_label(), 10, 21, 0.6)]; - enhancer.enhance(&mut entities, text, None, None); + enhancer.enhance(&mut entities, &Context::new(text)); assert!(entities[0].confidence.get() > 0.6); assert!( entities[0] @@ -362,18 +267,6 @@ mod tests { ); } - #[test] - fn boosts_entity_when_keyword_in_suffix() { - let enhancer = enhancer(vec![rule(govid_label(), &["social"], 0, 5, 0.2)]); - let text = "123-45-6789 (social security number)"; - let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; - enhancer.enhance(&mut entities, text, None, None); - assert!( - entities[0].confidence.get() > 0.6, - "trailing keyword within suffix window should boost", - ); - } - #[test] fn suffix_zero_ignores_trailing_keyword() { // Prefix-only: trailing keyword must not boost. @@ -381,7 +274,7 @@ mod tests { let text = "123-45-6789 (social security number)"; let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None, None); + enhancer.enhance(&mut entities, &Context::new(text)); assert_eq!(entities[0].confidence.get(), before); } @@ -391,7 +284,7 @@ mod tests { let text = "Mr. 
Smith is named in the report."; let mut entities = vec![entity(person_label(), 4, 9, 0.5)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None, None); + enhancer.enhance(&mut entities, &Context::new(text)); assert_eq!(entities[0].confidence.get(), before); } @@ -405,7 +298,7 @@ mod tests { let xyz_end = xyz_start + "XYZ".len(); let mut entities = vec![entity(govid_label(), xyz_start, xyz_end, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None, None); + enhancer.enhance(&mut entities, &Context::new(text)); assert_eq!(entities[0].confidence.get(), before); } @@ -414,7 +307,7 @@ mod tests { let enhancer = enhancer(vec![rule(govid_label(), &["here"], 5, 5, 0.9)]); let text = "the value is right here in plain sight"; let mut entities = vec![entity(govid_label(), 16, 21, 0.95)]; - enhancer.enhance(&mut entities, text, None, None); + enhancer.enhance(&mut entities, &Context::new(text)); assert!((entities[0].confidence.get() - 1.0).abs() < f64::EPSILON); } @@ -438,7 +331,7 @@ mod tests { let ssn_entity_start = ssn_only.find("123").unwrap(); let ssn_entity_end = ssn_entity_start + "123-45-6789".len(); let mut from_first = vec![entity(govid_label(), ssn_entity_start, ssn_entity_end, 0.6)]; - make_enhancer().enhance(&mut from_first, ssn_only, None, None); + make_enhancer().enhance(&mut from_first, &Context::new(ssn_only)); assert!( from_first[0].confidence.get() > 0.6, "keyword `ssn` from the first rule must still boost after merge", @@ -449,7 +342,7 @@ mod tests { let tax_entity_start = taxid_only.find("987").unwrap(); let tax_entity_end = tax_entity_start + "987-65-4329".len(); let mut from_second = vec![entity(govid_label(), tax_entity_start, tax_entity_end, 0.6)]; - make_enhancer().enhance(&mut from_second, taxid_only, None, None); + make_enhancer().enhance(&mut from_second, &Context::new(taxid_only)); assert!( from_second[0].confidence.get() > 0.6, "keyword `tax id` from the second rule must 
still boost after merge", @@ -464,38 +357,24 @@ mod tests { let entity_start = text.find("123").unwrap(); let entity_end = entity_start + "123-45-6789".len(); let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, None, None); + enhancer.enhance(&mut entities, &Context::new(text)); assert!( entities[0].confidence.get() > 0.6, "unicode word should be reachable within 3-word prefix", ); } - #[test] - fn word_window_excludes_too_distant_unicode() { - // 2-word prefix: "café" is the 3rd word before the entity. - let enhancer = enhancer(vec![rule(govid_label(), &["café"], 2, 0, 0.2)]); - let text = "café naïve resume — 123-45-6789"; - let entity_start = text.find("123").unwrap(); - let entity_end = entity_start + "123-45-6789".len(); - let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, None, None); - assert_eq!(entities[0].confidence.get(), before); - } - #[test] fn empty_tokens_slice_matches_none_behaviour() { - // Keyword sits in the prefix word-window but outside the - // entity bytes. With the empty-slice fix, `Some(&[])` must - // not collapse the snippet to the entity bytes — it should - // fall back to the word-window path just like `None`. + // `Some(&[])` must not collapse the snippet to entity + // bytes — it should fall back to the word-window path + // just like `None`. 
let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); let text = "Your SSN: 123-45-6789"; let mut from_none = vec![entity(govid_label(), 10, 21, 0.6)]; let mut from_empty = vec![entity(govid_label(), 10, 21, 0.6)]; - enhancer.enhance(&mut from_none, text, None, None); - enhancer.enhance(&mut from_empty, text, Some(&[]), None); + enhancer.enhance(&mut from_none, &Context::new(text)); + enhancer.enhance(&mut from_empty, &Context::new(text).with_tokens(&[])); assert_eq!( from_none[0].confidence.get(), from_empty[0].confidence.get(), @@ -513,9 +392,7 @@ mod tests { // prefix reaches is the immediate predecessor token // "Your". The tokenizer here treats "social security" // as a single compound token outside the window, so the - // keyword "social security" must NOT fire — unlike a - // hypothetical caller that gave it the word-window path, - // which would split on whitespace. + // keyword "social security" must NOT fire. let enhancer = enhancer(vec![rule(govid_label(), &["social security"], 1, 0, 0.2)]); let text = "social security: Your 123-45-6789"; let entity_start = text.find("123").unwrap(); @@ -527,7 +404,7 @@ mod tests { ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; let before = entities[0].confidence.get(); - enhancer.enhance(&mut entities, text, Some(&tokens), None); + enhancer.enhance(&mut entities, &Context::new(text).with_tokens(&tokens)); assert_eq!( entities[0].confidence.get(), before, @@ -535,27 +412,6 @@ mod tests { ); } - #[test] - fn token_path_boosts_when_keyword_within_token_window() { - // Same tokens, 2-word prefix: now the `social security` - // token is reachable and the boost fires. 
- let enhancer = enhancer(vec![rule(govid_label(), &["social security"], 2, 0, 0.2)]); - let text = "social security: Your 123-45-6789"; - let entity_start = text.find("123").unwrap(); - let entity_end = entity_start + "123-45-6789".len(); - let tokens: Vec = vec![ - Token::from_text("social security", 0..15), - Token::from_text("Your", 17..21), - Token::from_text("123-45-6789", 22..33), - ]; - let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, Some(&tokens), None); - assert!( - entities[0].confidence.get() > 0.6, - "2-word prefix should reach the `social security` token", - ); - } - #[test] fn lemma_matcher_boosts_on_morphological_variant() { // Substring matcher would miss `running` for keyword @@ -578,31 +434,22 @@ mod tests { Token::from_text("system", 41..47), ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, Some(&tokens), None); + enhancer.enhance(&mut entities, &Context::new(text).with_tokens(&tokens)); assert!( entities[0].confidence.get() > 0.6, "lemma matcher should match `run` against the `running` token's lemma", ); - assert!( - entities[0] - .trail - .iter() - .any(|s| matches!(s.kind, TrailStepKind::Refinement)), - ); } #[test] fn tokens_with_no_overlap_fall_back_to_word_window() { // Tokens cover the first half of the document; the entity - // is in the second half, outside any token's range. - // Without the fallback the token slice would be empty and - // the snippet would collapse to entity bytes. With the - // fallback, the word-window path reaches the keyword. + // is in the second half, outside any token's range. The + // word-window path must still reach the keyword. let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); let text = "First half of the document. 
Your SSN: 123-45-6789"; let entity_start = text.find("123").unwrap(); let entity_end = entity_start + "123-45-6789".len(); - // Tokens that cover only the first sentence. let tokens: Vec = vec![ Token::from_text("First", 0..5), Token::from_text("half", 6..10), @@ -611,10 +458,53 @@ mod tests { Token::from_text("document", 18..26), ]; let mut entities = vec![entity(govid_label(), entity_start, entity_end, 0.6)]; - enhancer.enhance(&mut entities, text, Some(&tokens), None); + enhancer.enhance(&mut entities, &Context::new(text).with_tokens(&tokens)); assert!( entities[0].confidence.get() > 0.6, "tokens that don't overlap the entity must fall back to the word window", ); } + + #[test] + fn out_of_band_hint_boosts_when_window_is_empty() { + // Cell-only text has no surrounding context — the word + // window walk finds nothing — but the caller supplies the + // CSV column header as an out-of-band hint that contains + // a rule keyword. Confidence must lift, and the trail + // step must mark the source as `context-hint`. + let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); + let text = "123-45-6789"; + let hints = ["ssn".to_owned()]; + let mut entities = vec![entity(govid_label(), 0, 11, 0.6)]; + enhancer.enhance(&mut entities, &Context::new(text).with_hints(&hints)); + assert!( + entities[0].confidence.get() > 0.6, + "out-of-band hint matching a rule keyword must boost", + ); + assert!( + entities[0] + .trail + .iter() + .any(|s| s.source == "context-hint"), + "trail step must record the hint-source provenance", + ); + } + + #[test] + fn hint_path_is_independent_of_window_path() { + // The in-text window already fires, so the hint path + // shouldn't double-boost. Exactly one refinement step + // appears on the entity. 
+ let enhancer = enhancer(vec![rule(govid_label(), &["ssn"], 5, 5, 0.2)]); + let text = "Your SSN: 123-45-6789"; + let hints = ["ssn".to_owned()]; + let mut entities = vec![entity(govid_label(), 10, 21, 0.6)]; + enhancer.enhance(&mut entities, &Context::new(text).with_hints(&hints)); + let refinements = entities[0] + .trail + .iter() + .filter(|s| matches!(s.kind, TrailStepKind::Refinement)) + .count(); + assert_eq!(refinements, 1, "rule must boost at most once per entity"); + } } diff --git a/crates/nvisy-context/src/enhancer/window.rs b/crates/nvisy-context/src/enhancer/window.rs new file mode 100644 index 00000000..85252fc2 --- /dev/null +++ b/crates/nvisy-context/src/enhancer/window.rs @@ -0,0 +1,118 @@ +//! Window-slicing helpers shared by [`Enhancer::apply_rule`]. +//! +//! Two coordinate systems matter here: +//! +//! - **Bytes**: source-text offsets. `word_window` walks Unicode +//! word segments to expand an entity's `[start, end)` to +//! `prefix`/`suffix` words on either side. +//! - **Tokens**: pre-tokenized stream from an upstream NLP engine. +//! `slice_tokens_around` takes a `prefix`/`suffix` count and +//! returns the contiguous token slice that covers the entity +//! plus that many neighbours. +//! +//! Both paths feed the same downstream [`KeywordMatcher`] — +//! [`token_span`] reduces a non-empty token slice back to its +//! spanning substring for matchers that operate on raw text. +//! +//! [`Enhancer::apply_rule`]: super::Enhancer +//! [`KeywordMatcher`]: crate::KeywordMatcher + +use unicode_segmentation::UnicodeSegmentation; + +use crate::io::Token; + +/// Walk `prefix` words before `[start, end)` and `suffix` words +/// after, via Unicode word segmentation, and return the spanning +/// substring (including any non-word whitespace and punctuation +/// between words). The returned slice covers `[start, end)` itself +/// plus the prefix / suffix words; the entity's own bytes are +/// always inside. 
+pub(super) fn word_window( + text: &str, + start: usize, + end: usize, + prefix: usize, + suffix: usize, +) -> &str { + let prefix_text = &text[..start.min(text.len())]; + let suffix_text = &text[end.min(text.len())..]; + + // `unicode_word_indices` yields `(byte_offset, word_str)` for + // every "word" (alphanumeric run) in source order. Take the + // last `prefix` on the prefix side, the first `suffix` on the + // suffix side, and compute the spanning byte range. + let prefix_words: Vec<(usize, &str)> = prefix_text.unicode_word_indices().collect(); + let prefix_take = prefix_words.len().saturating_sub(prefix); + let prefix_byte = prefix_words + .get(prefix_take) + .map(|(idx, _)| *idx) + .unwrap_or(start.min(text.len())); + + let suffix_byte = if suffix == 0 { + end.min(text.len()) + } else { + suffix_text + .unicode_word_indices() + .nth(suffix - 1) + .map(|(idx, word)| end + idx + word.len()) + .unwrap_or(text.len()) + }; + + let lo = floor_char_boundary(text, prefix_byte); + let hi = ceil_char_boundary(text, suffix_byte.min(text.len())); + &text[lo..hi] +} + +/// Slice tokens by *count*: take `prefix` tokens before the first +/// token overlapping `[start, end)` and `suffix` tokens after the +/// last. The returned slice is contiguous. +pub(super) fn slice_tokens_around( + tokens: &[Token], + start: usize, + end: usize, + prefix: usize, + suffix: usize, +) -> &[Token] { + if tokens.is_empty() { + return &[]; + } + // First token whose `offset.end > start` overlaps or follows the entity. + let first_overlap = tokens.partition_point(|t| t.offset.end <= start); + // One past the last token whose `offset.start < end` overlaps the entity. + let last_overlap = tokens.partition_point(|t| t.offset.start < end); + let lo = first_overlap.saturating_sub(prefix); + let hi = (last_overlap + suffix).min(tokens.len()); + if lo >= hi { + return &[]; + } + &tokens[lo..hi] +} + +/// Spanning substring covering `tokens` plus the entity itself. 
+/// Used to give the matcher a contiguous text window when slicing +/// against the token stream. +/// +/// Precondition: `tokens` is non-empty. Callers must take the +/// [`word_window`] fallback path when their token slice is empty. +pub(super) fn token_span<'a>(text: &'a str, tokens: &[Token], start: usize, end: usize) -> &'a str { + debug_assert!(!tokens.is_empty(), "token_span requires non-empty slice"); + let lo = tokens[0].offset.start.min(start); + let hi = tokens[tokens.len() - 1].offset.end.max(end); + let lo = floor_char_boundary(text, lo.min(text.len())); + let hi = ceil_char_boundary(text, hi.min(text.len())); + &text[lo..hi] +} + +fn floor_char_boundary(s: &str, mut pos: usize) -> usize { + while pos > 0 && !s.is_char_boundary(pos) { + pos -= 1; + } + pos +} + +fn ceil_char_boundary(s: &str, mut pos: usize) -> usize { + while pos < s.len() && !s.is_char_boundary(pos) { + pos += 1; + } + pos +} diff --git a/crates/nvisy-context/src/io/mod.rs b/crates/nvisy-context/src/io/mod.rs new file mode 100644 index 00000000..df3b9fb0 --- /dev/null +++ b/crates/nvisy-context/src/io/mod.rs @@ -0,0 +1,20 @@ +//! Wiring between the [`Enhancer`] and the [`EntityRecognizer`] +//! pipeline. +//! +//! - [`Token`] / [`Tokens`] is the shared NLP token artifact the +//! enhancer reads off `RecognizerInput.artifacts`. +//! - [`ContextEnhanced`] wraps any [`EntityRecognizer`] so +//! the enhancer runs automatically after the inner recognizer's +//! pass. +//! +//! All three types are re-exported at the crate root. +//! +//! [`Enhancer`]: crate::Enhancer +//! [`EntityRecognizer`]: nvisy_core::recognition::EntityRecognizer +//! 
[`EntityRecognizer`]: nvisy_core::recognition::EntityRecognizer + +mod tokens; +mod wrapper; + +pub use self::tokens::{Token, Tokens}; +pub use self::wrapper::ContextEnhanced; diff --git a/crates/nvisy-context/src/tokens.rs b/crates/nvisy-context/src/io/tokens.rs similarity index 100% rename from crates/nvisy-context/src/tokens.rs rename to crates/nvisy-context/src/io/tokens.rs diff --git a/crates/nvisy-context/src/wrapper.rs b/crates/nvisy-context/src/io/wrapper.rs similarity index 84% rename from crates/nvisy-context/src/wrapper.rs rename to crates/nvisy-context/src/io/wrapper.rs index c9d415ec..6329bf5d 100644 --- a/crates/nvisy-context/src/wrapper.rs +++ b/crates/nvisy-context/src/io/wrapper.rs @@ -22,7 +22,8 @@ use nvisy_core::Result; use nvisy_core::modality::Text; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput}; -use super::{Enhancer, Tokens}; +use super::Tokens; +use crate::{Context, Enhancer}; /// Wraps an [`EntityRecognizer`] with a post-recognition /// [`Enhancer`] pass. 
Implements [`EntityRecognizer`] so @@ -68,11 +69,14 @@ where if self.enhancer.is_empty() { return Ok(output); } - let text = input.data.text.as_str(); - let tokens = input.artifacts.get::().map(Tokens::as_slice); - let language = input.language.as_ref(); - self.enhancer - .enhance(&mut output.entities, text, tokens, language); + let mut ctx = Context::new(input.data.text.as_str()).with_hints(&input.context_hints); + if let Some(tokens) = input.artifacts.get::() { + ctx = ctx.with_tokens(tokens.as_slice()); + } + if let Some(language) = input.language.as_ref() { + ctx = ctx.with_language(language); + } + self.enhancer.enhance(&mut output.entities, &ctx); Ok(output) } } diff --git a/crates/nvisy-context/src/lib.rs b/crates/nvisy-context/src/lib.rs index 244e113c..7c470d06 100644 --- a/crates/nvisy-context/src/lib.rs +++ b/crates/nvisy-context/src/lib.rs @@ -3,13 +3,11 @@ #![doc = include_str!("../README.md")] mod enhancer; -mod matcher; +mod io; +mod matching; mod rule; -mod tokens; -mod wrapper; -pub use self::enhancer::Enhancer; -pub use self::matcher::{KeywordMatcher, LemmaMatcher, SubstringMatcher}; +pub use self::enhancer::{Context, Enhancer}; +pub use self::io::{ContextEnhanced, Token, Tokens}; +pub use self::matching::{KeywordMatcher, LemmaMatcher, SubstringMatcher}; pub use self::rule::{BoostRule, DEFAULT_BOOST, DEFAULT_PREFIX_WORDS, DEFAULT_SUFFIX_WORDS}; -pub use self::tokens::{Token, Tokens}; -pub use self::wrapper::ContextEnhanced; diff --git a/crates/nvisy-context/src/matcher.rs b/crates/nvisy-context/src/matcher.rs deleted file mode 100644 index 06beef22..00000000 --- a/crates/nvisy-context/src/matcher.rs +++ /dev/null @@ -1,134 +0,0 @@ -//! [`KeywordMatcher`] strategy + the two shipped implementations. -//! -//! - [`SubstringMatcher`] — ASCII case-insensitive substring search -//! over the raw text window. The fallback when no token artifact -//! is present on `RecognizerInput.artifacts`. -//! 
- [`LemmaMatcher`] — matches keywords against lemmatized tokens -//! the upstream NLP engine stamped on `RecognizerInput.artifacts` -//! as a [`Tokens`] entry. Recognizes morphological variants -//! ("running" → "run", "SSNs" → "ssn") substring matching misses. -//! -//! Both implementations are stateless; the [`Enhancer`] owns one -//! as a configured strategy. -//! -//! [`Tokens`]: super::Tokens -//! [`Enhancer`]: super::Enhancer - -use hipstr::HipStr; - -use super::Token; - -/// Decide whether any keyword from `keywords` fires within the -/// candidate region around an entity match. -/// -/// The strategy slot that lets the enhancer swap raw substring -/// matching for lemma-aware matching (or a third-party -/// fuzzy/word-boundary implementation) without changing its core -/// pipeline. -/// -/// Implementations receive both a raw `window` slice of the source -/// text (for substring strategies) and the `tokens` covering that -/// same range (for token/lemma strategies). Either or both may be -/// ignored; `tokens` is empty when no NLP engine produced a token -/// artifact. -pub trait KeywordMatcher: Send + Sync { - /// `true` if at least one keyword from `keywords` appears in - /// the input. - fn any_match(&self, window: &str, tokens: &[Token], keywords: &[HipStr<'static>]) -> bool; -} - -/// ASCII case-insensitive substring matcher. The default — -/// runs whenever no token artifact was stamped on -/// `RecognizerInput.artifacts`, or whenever the caller explicitly -/// picks raw matching. -/// -/// Fast, allocation-light, permissive: the keyword `"email"` fires -/// inside `"MyEmailAddress"`. Ignores the `tokens` argument. 
-#[derive(Debug, Clone, Copy, Default)] -pub struct SubstringMatcher; - -impl KeywordMatcher for SubstringMatcher { - fn any_match(&self, window: &str, _tokens: &[Token], keywords: &[HipStr<'static>]) -> bool { - let lowered = window.to_ascii_lowercase(); - keywords - .iter() - .any(|kw| lowered.contains(kw.as_str().to_ascii_lowercase().as_str())) - } -} - -/// Lemma-aware matcher. Compares each lemma in `tokens` against -/// the keyword list with ASCII case-insensitive equality. -/// -/// Falls back to [`SubstringMatcher`] semantics when `tokens` is -/// empty (no shared NLP artifact was produced) so the enhancer -/// runs uniformly regardless of whether the upstream pass emitted -/// tokens. -/// -/// Recognizes morphological variants the substring matcher cannot: -/// `"running" → "run"`, `"dogs" → "dog"`, `"SSNs" → "ssn"`. Cost -/// is one lowercase per keyword + one lowercase per lemma per -/// match attempt. -#[derive(Debug, Clone, Copy, Default)] -pub struct LemmaMatcher; - -impl KeywordMatcher for LemmaMatcher { - fn any_match(&self, window: &str, tokens: &[Token], keywords: &[HipStr<'static>]) -> bool { - if tokens.is_empty() { - return SubstringMatcher.any_match(window, tokens, keywords); - } - let lowered_keywords: Vec = keywords - .iter() - .map(|k| k.as_str().to_ascii_lowercase()) - .collect(); - tokens.iter().any(|tok| { - let lemma = tok.lemma.as_str().to_ascii_lowercase(); - lowered_keywords.contains(&lemma) - }) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn kws(items: &[&'static str]) -> Vec> { - items.iter().copied().map(HipStr::from).collect() - } - - #[test] - fn substring_matches_case_insensitively() { - let m = SubstringMatcher; - assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); - assert!(m.any_match( - "the SOCIAL SECURITY number", - &[], - &kws(&["social security"]) - )); - assert!(!m.any_match("nothing here", &[], &kws(&["ssn"]))); - } - - #[test] - fn substring_is_permissive() { - let m = SubstringMatcher; - 
assert!(m.any_match("MyEmailAddress", &[], &kws(&["email"]))); - } - - #[test] - fn lemma_matches_morph_variants() { - let tokens = vec![ - Token::from_text("the", 0..3), - Token::from_text("running", 4..11).with_lemma("run"), - Token::from_text("dogs", 12..16).with_lemma("dog"), - ]; - let m = LemmaMatcher; - assert!(m.any_match("", &tokens, &kws(&["run"]))); - assert!(m.any_match("", &tokens, &kws(&["dog"]))); - assert!(!m.any_match("", &tokens, &kws(&["cat"]))); - } - - #[test] - fn lemma_falls_back_to_substring_without_tokens() { - let m = LemmaMatcher; - assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); - } -} diff --git a/crates/nvisy-context/src/matching/lemma.rs b/crates/nvisy-context/src/matching/lemma.rs new file mode 100644 index 00000000..0e1d5b5e --- /dev/null +++ b/crates/nvisy-context/src/matching/lemma.rs @@ -0,0 +1,65 @@ +//! Lemma-aware [`KeywordMatcher`] implementation. + +use hipstr::HipStr; + +use super::matcher::{KeywordMatcher, SubstringMatcher}; +use crate::io::Token; + +/// Lemma-aware matcher. Compares each lemma in `tokens` against +/// the keyword list with ASCII case-insensitive equality. +/// +/// Falls back to [`SubstringMatcher`] semantics when `tokens` is +/// empty (no shared NLP artifact was produced) so the enhancer +/// runs uniformly regardless of whether the upstream pass emitted +/// tokens. +/// +/// Recognizes morphological variants the substring matcher cannot: +/// `"running" → "run"`, `"dogs" → "dog"`, `"SSNs" → "ssn"`. Cost +/// is one lowercase per keyword + one lowercase per lemma per +/// match attempt. 
+#[derive(Debug, Clone, Copy, Default)] +pub struct LemmaMatcher; + +impl KeywordMatcher for LemmaMatcher { + fn any_match(&self, window: &str, tokens: &[Token], keywords: &[HipStr<'static>]) -> bool { + if tokens.is_empty() { + return SubstringMatcher.any_match(window, tokens, keywords); + } + let lowered_keywords: Vec = keywords + .iter() + .map(|k| k.as_str().to_ascii_lowercase()) + .collect(); + tokens.iter().any(|tok| { + let lemma = tok.lemma.as_str().to_ascii_lowercase(); + lowered_keywords.contains(&lemma) + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn kws(items: &[&'static str]) -> Vec> { + items.iter().copied().map(HipStr::from).collect() + } + + #[test] + fn matches_morph_variants() { + let tokens = vec![ + Token::from_text("the", 0..3), + Token::from_text("running", 4..11).with_lemma("run"), + Token::from_text("dogs", 12..16).with_lemma("dog"), + ]; + let m = LemmaMatcher; + assert!(m.any_match("", &tokens, &kws(&["run"]))); + assert!(m.any_match("", &tokens, &kws(&["dog"]))); + assert!(!m.any_match("", &tokens, &kws(&["cat"]))); + } + + #[test] + fn falls_back_to_substring_without_tokens() { + let m = LemmaMatcher; + assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); + } +} diff --git a/crates/nvisy-context/src/matching/matcher.rs b/crates/nvisy-context/src/matching/matcher.rs new file mode 100644 index 00000000..5e7ac560 --- /dev/null +++ b/crates/nvisy-context/src/matching/matcher.rs @@ -0,0 +1,70 @@ +//! [`KeywordMatcher`] trait + the default [`SubstringMatcher`]. + +use hipstr::HipStr; + +use crate::io::Token; + +/// Decide whether any keyword from `keywords` fires within the +/// candidate region around an entity match. +/// +/// The strategy slot that lets the enhancer swap raw substring +/// matching for lemma-aware matching (or a third-party +/// fuzzy/word-boundary implementation) without changing its core +/// pipeline. 
+/// +/// Implementations receive both a raw `window` slice of the source +/// text (for substring strategies) and the `tokens` covering that +/// same range (for token/lemma strategies). Either or both may be +/// ignored; `tokens` is empty when no NLP engine produced a token +/// artifact. +pub trait KeywordMatcher: Send + Sync { + /// `true` if at least one keyword from `keywords` appears in + /// the input. + fn any_match(&self, window: &str, tokens: &[Token], keywords: &[HipStr<'static>]) -> bool; +} + +/// ASCII case-insensitive substring matcher. The default — +/// runs whenever no token artifact was stamped on +/// `RecognizerInput.artifacts`, or whenever the caller explicitly +/// picks raw matching. +/// +/// Fast, allocation-light, permissive: the keyword `"email"` fires +/// inside `"MyEmailAddress"`. Ignores the `tokens` argument. +#[derive(Debug, Clone, Copy, Default)] +pub struct SubstringMatcher; + +impl KeywordMatcher for SubstringMatcher { + fn any_match(&self, window: &str, _tokens: &[Token], keywords: &[HipStr<'static>]) -> bool { + let lowered = window.to_ascii_lowercase(); + keywords + .iter() + .any(|kw| lowered.contains(kw.as_str().to_ascii_lowercase().as_str())) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn kws(items: &[&'static str]) -> Vec> { + items.iter().copied().map(HipStr::from).collect() + } + + #[test] + fn substring_matches_case_insensitively() { + let m = SubstringMatcher; + assert!(m.any_match("Your SSN: 123", &[], &kws(&["ssn"]))); + assert!(m.any_match( + "the SOCIAL SECURITY number", + &[], + &kws(&["social security"]) + )); + assert!(!m.any_match("nothing here", &[], &kws(&["ssn"]))); + } + + #[test] + fn substring_is_permissive() { + let m = SubstringMatcher; + assert!(m.any_match("MyEmailAddress", &[], &kws(&["email"]))); + } +} diff --git a/crates/nvisy-context/src/matching/mod.rs b/crates/nvisy-context/src/matching/mod.rs new file mode 100644 index 00000000..3ffe59cf --- /dev/null +++ 
b/crates/nvisy-context/src/matching/mod.rs @@ -0,0 +1,19 @@ +//! Keyword-matching strategies plugged into the [`Enhancer`]. +//! +//! - [`KeywordMatcher`] is the trait the enhancer talks to. +//! - [`SubstringMatcher`] is the default: ASCII case-insensitive +//! substring search over the raw text window. Runs whenever no +//! token artifact is present on `RecognizerInput.artifacts`. +//! - [`LemmaMatcher`] reads lemmatized tokens an upstream NLP +//! engine stamped on `RecognizerInput.artifacts`. Recognizes +//! morphological variants substring matching misses. +//! +//! All three are re-exported at the crate root. +//! +//! [`Enhancer`]: crate::Enhancer + +mod lemma; +mod matcher; + +pub use self::lemma::LemmaMatcher; +pub use self::matcher::{KeywordMatcher, SubstringMatcher}; diff --git a/crates/nvisy-core/src/recognition/input.rs b/crates/nvisy-core/src/recognition/input.rs index 2bbc57ba..6da30fd7 100644 --- a/crates/nvisy-core/src/recognition/input.rs +++ b/crates/nvisy-core/src/recognition/input.rs @@ -64,6 +64,14 @@ pub struct RecognizerInput { /// behavior for domain-specific terms; those that don't ignore the /// field. pub labels: Vec, + /// Out-of-band context strings the caller wants treated as + /// in-context for confidence boosting (e.g. the column header + /// of a CSV cell, the JSON object key of a string value, the + /// log field name a value sits under). Recognizers that run a + /// context enhancer feed these to the enhancer alongside the + /// in-text word window; recognizers without an enhancer ignore + /// the field. + pub context_hints: Vec, /// Correlation UUID propagated through the tracing span for this /// call. Recognizer bodies do not read this directly; it's set /// on the span by the caller. 
@@ -82,6 +90,7 @@ impl RecognizerInput { country: None, hints: Vec::new(), labels: Vec::new(), + context_hints: Vec::new(), correlation_id: None, } } @@ -128,6 +137,14 @@ impl RecognizerInput { self } + /// Attach out-of-band context hint strings (column headers, + /// JSON keys, …) the enhancer should treat as in-context. + #[must_use] + pub fn with_context_hints(mut self, hints: Vec) -> Self { + self.context_hints = hints; + self + } + /// Set the correlation id propagated through the tracing span. #[must_use] pub fn with_correlation_id(mut self, id: Uuid) -> Self { diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 08fe0410..94c69b56 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -45,5 +45,11 @@ aho-corasick = { workspace = true, features = [] } # Tabular document parsing (dictionary loading from CSV) csv = { workspace = true, features = [] } +# Base58Check decoder for the crypto.btc validator +bs58 = { workspace = true, features = ["check"] } + +# Region-aware phone-number parsing for the phone validator +phonenumber = { workspace = true } + [dev-dependencies] tokio = { workspace = true, features = ["macros", "rt"] } diff --git a/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml b/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml index 47ec45b8..85f2e388 100644 --- a/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml +++ b/crates/nvisy-pattern/assets/patterns/uk/identity/driving_licence.toml @@ -22,3 +22,4 @@ context = [ [[variants]] regex = '\b[A-Z9]{5}[0-9](?:0[1-9]|1[0-2]|5[1-9]|6[0-2])(?:0[1-9]|[12][0-9]|3[01])[0-9][A-Z9]{2}[A-Z0-9][A-Z]{2}\b' score = 0.5 +validator = "uk.driving_licence" diff --git a/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml b/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml index 0d585be7..d23047ee 100644 --- 
a/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml +++ b/crates/nvisy-pattern/assets/patterns/uk/vehicle/registration.toml @@ -29,6 +29,7 @@ context = [ [[variants]] regex = '\b[A-HJ-PR-Y][A-HJ-PR-Y](?:0[1-9]|[1-7][0-9])[- ]?[A-HJ-PR-Z]{3}\b' score = 0.3 +validator = "uk.vehicle_registration" [[variants]] regex = '\b[A-HJ-NPR-TV-Y]\d{1,3}[- ]?[A-HJ-PR-Y][A-HJ-PR-Z]{2}\b' diff --git a/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml b/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml index 80e80390..3859126e 100644 --- a/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml +++ b/crates/nvisy-pattern/assets/patterns/us/health/medical_license.toml @@ -26,11 +26,11 @@ context = [ # DEA registration type letters: A, B, C, D, E, F, G, H, J, K, L, # M, P, R, S, T, U, X (plus mid-2000s practitioner-9 series). [[variants]] -regex = '[abcdefghjklmprstuxABCDEFGHJKLMPRSTUX][a-zA-Z]\d{7}' +regex = '\b[abcdefghjklmprstuxABCDEFGHJKLMPRSTUX][a-zA-Z]\d{7}\b' score = 0.4 validator = "us.dea_number" [[variants]] -regex = '[abcdefghjklmprstuxABCDEFGHJKLMPRSTUX]9\d{7}' +regex = '\b[abcdefghjklmprstuxABCDEFGHJKLMPRSTUX]9\d{7}\b' score = 0.4 validator = "us.dea_number" diff --git a/crates/nvisy-pattern/assets/patterns/us/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/us/identity/passport.toml index 439529a6..7c650847 100644 --- a/crates/nvisy-pattern/assets/patterns/us/identity/passport.toml +++ b/crates/nvisy-pattern/assets/patterns/us/identity/passport.toml @@ -1,6 +1,13 @@ name = "us-passport" label = "passport_number" countries = ["US"] +context = [ + "passport", + "passport#", + "travel document", + "us passport", + "united states passport", +] [[variants]] regex = "\\b[A-Z]\\d{8}\\b" diff --git a/crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml b/crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml index 53ee38c9..f6fd66f9 100644 --- 
a/crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml +++ b/crates/nvisy-pattern/assets/patterns/us/identity/postal_code.toml @@ -1,7 +1,17 @@ name = "us-postal-code" label = "postal_code" countries = ["US"] +context = [ + "zip", + "zip code", + "zipcode", + "postal", + "postal code", + "address", + "mailing", +] [[variants]] regex = "\\b\\d{5}(?:-\\d{4})?\\b" -score = 0.5 +score = 0.1 +validator = "us.postal_code" diff --git a/crates/nvisy-pattern/assets/patterns/world/credentials/aws_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/aws_key.toml index 189aacc9..b4320c3c 100644 --- a/crates/nvisy-pattern/assets/patterns/world/credentials/aws_key.toml +++ b/crates/nvisy-pattern/assets/patterns/world/credentials/aws_key.toml @@ -1,6 +1,26 @@ name = "aws-key" label = "api_key" +context = [ + "aws", + "amazon", + "access key", + "secret access key", + "aws_access_key_id", + "aws_secret_access_key", +] +# AWS access key ID — 20 chars, fixed 4-letter principal prefix +# plus 16 base32-ish chars. Prefixes per AWS docs: AKIA (long-term +# access key), ASIA (STS temporary), AIDA (IAM user identifier), AROA +# (IAM role), ANPA / AGPA (managed policy / group), AIPA (EC2 +# instance profile). [[variants]] -regex = "\\bAKIA[0-9A-Z]{16}\\b" +regex = "\\b(?:AKIA|ASIA|AIDA|AROA|ANPA|AGPA|AIPA)[0-9A-Z]{16}\\b" score = 0.95 + +# AWS secret access key — 40 chars of standard base64. Without context +# this collides with any 40-char base64 string (hashes, tokens), +# so the score is modest; the boost layer lifts colocated hits. 
+[[variants]] +regex = "\\b[A-Za-z0-9/+=]{40}\\b" +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml index bcd59d63..c67d4956 100644 --- a/crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml +++ b/crates/nvisy-pattern/assets/patterns/world/credentials/generic_api_key.toml @@ -1,6 +1,10 @@ name = "generic-api-key" label = "api_key" +# Separator after the keyword is either `:` / `=` (assignment +# style: `api_key="…"`) or one-or-more spaces (header style: +# `Authorization: Bearer `). Accepting whitespace-only is +# what catches the dominant Authorization-header leak form. [[variants]] -regex = "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?" +regex = "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)(?:\\s*[:=]\\s*|\\s+)[\"']?([a-zA-Z0-9_\\-\\.]{20,})[\"']?" score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/world/credentials/github_token.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/github_token.toml index ba247e60..a1a76b41 100644 --- a/crates/nvisy-pattern/assets/patterns/world/credentials/github_token.toml +++ b/crates/nvisy-pattern/assets/patterns/world/credentials/github_token.toml @@ -1,6 +1,14 @@ name = "github-token" label = "auth_token" +# Classic prefix tokens — gh{p,o,u,s,r}_ for PAT, OAuth, +# user-to-server, server-to-server, refresh. [[variants]] regex = "\\bgh[pousr]_[a-zA-Z0-9]{36}\\b" score = 0.95 + +# Fine-grained personal access token (introduced 2022) — current +# recommended PAT form, 82-char mixed-case body over [A-Za-z0-9_]. 
+[[variants]] +regex = "\\bgithub_pat_[A-Za-z0-9_]{82}\\b" +score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/world/credentials/private_key.toml b/crates/nvisy-pattern/assets/patterns/world/credentials/private_key.toml index 61d6977e..7f0aaf59 100644 --- a/crates/nvisy-pattern/assets/patterns/world/credentials/private_key.toml +++ b/crates/nvisy-pattern/assets/patterns/world/credentials/private_key.toml @@ -2,5 +2,16 @@ name = "private-key" label = "private_key" [[variants]] -regex = "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----" +regex = "-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED |PGP )?PRIVATE KEY(?: BLOCK)?-----[\\s\\S]*?-----END (?:RSA |EC |DSA |OPENSSH |ENCRYPTED |PGP )?PRIVATE KEY(?: BLOCK)?-----" +score = 0.98 + +# ssh.com / Tectia SSH2 private key (RFC 4716-style armor; the RFC itself only defines the public-key file). +[[variants]] +regex = "---- BEGIN SSH2 ENCRYPTED PRIVATE KEY ----[\\s\\S]*?---- END SSH2 ENCRYPTED PRIVATE KEY ----" +score = 0.98 + +# PuTTY .ppk (PuTTY-User-Key-File-2 or -3). Body extends to the +# `Private-MAC:` trailer which closes the keystore. +[[variants]] +regex = "PuTTY-User-Key-File-[23]:[\\s\\S]*?Private-MAC: [0-9a-f]+" score = 0.98 diff --git a/crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml b/crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml index 31f6fad6..c4c3118c 100644 --- a/crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/bitcoin_address.toml @@ -1,6 +1,17 @@ name = "bitcoin-address" label = "crypto_address" +# Legacy Base58 addresses (P2PKH `1…`, P2SH `3…`) carry a +# four-byte double-SHA256 checksum that the `crypto.btc` +# validator verifies. [[variants]] -regex = "\\b(?:bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\\b" +regex = "\\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\\b" +score = 0.5 +validator = "crypto.btc" + +# Bech32 / Bech32m segwit + Taproot. 
Length window covers +# v0 P2WPKH/P2WSH (42/62 chars total, body 39/59) and +# v1 P2TR Taproot (62 chars total, body 59). +[[variants]] +regex = "\\bbc1[a-z0-9]{25,59}\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml b/crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml index d3412b4f..a06bd0fe 100644 --- a/crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/credit_card.toml @@ -13,7 +13,10 @@ es = ["tarjeta", "crédito", "credito", "débito", "debito", "pago", "visa", "ma de = ["karte", "kredit", "kreditkarte", "debit", "zahlung", "visa", "mastercard", "amex"] fr = ["carte", "crédit", "credit", "débit", "debit", "paiement", "visa", "mastercard", "amex"] +# Brand BIN ranges: Visa (4), Mastercard (51-55 and 2-series +# 2221-2720 introduced 2017), Discover (6), Amex/Diners (3), +# and a loose 1xxx catch-all that Luhn filters down. [[variants]] -regex = '\b(?:(?:4\d{3})|(?:5[0-5]\d{2})|(?:6\d{3})|(?:1\d{3})|(?:3\d{3}))[- ]?(?:\d{3,4})[- ]?(?:\d{3,4})[- ]?(?:\d{3,5})\b' -score = 0.5 +regex = '\b(?:(?:4\d{3})|(?:5[0-5]\d{2})|(?:2[2-7]\d{2})|(?:6\d{3})|(?:1\d{3})|(?:3\d{3}))[- ]?(?:\d{3,4})[- ]?(?:\d{3,4})[- ]?(?:\d{3,5})\b' +score = 0.3 validator = "luhn" diff --git a/crates/nvisy-pattern/assets/patterns/world/finance/iban.toml b/crates/nvisy-pattern/assets/patterns/world/finance/iban.toml index 3680ee9e..79d2bf30 100644 --- a/crates/nvisy-pattern/assets/patterns/world/finance/iban.toml +++ b/crates/nvisy-pattern/assets/patterns/world/finance/iban.toml @@ -3,6 +3,6 @@ label = "iban" context = ["iban", "bank", "account", "transfer", "swift"] [[variants]] -regex = "\\b[A-Z]{2}\\d{2}\\s?[A-Z0-9]{4}\\s?(?:\\d{4}\\s?){2,7}\\d{1,4}\\b" +regex = "\\b[A-Z]{2}\\d{2}[\\s\\-]?[A-Z0-9]{4}[\\s\\-]?(?:[A-Z0-9]{4}[\\s\\-]?){2,7}[A-Z0-9]{1,4}\\b" score = 0.5 validator = "iban" diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs 
b/crates/nvisy-pattern/src/recognition/recognizer.rs index 9f6bbb3c..17d280f6 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -13,7 +13,7 @@ use super::compiled::{CompiledDictionary, CompiledPattern, has_word_boundaries}; use super::dictionary::Dictionary; use super::regex::Regex; use crate::shipped; -use crate::validators::ValidatorRegistry; +use crate::validators::{ValidationContext, ValidatorRegistry}; /// Runtime text recognizer composed of a regex pool and an /// Aho-Corasick automaton. @@ -434,9 +434,13 @@ impl EntityRecognizer for PatternRecognizer { if !input.applies_to_country(&pat.countries) { continue; } + let ctx = ValidationContext { + country: input.country, + language: input.language.clone(), + }; for m in pat.regex.find_iter(text) { if let Some(validator) = pat.validator.as_ref() - && !validator.validate(m.as_str()) + && !validator.validate(m.as_str(), &ctx) { continue; } diff --git a/crates/nvisy-pattern/src/validators/btc.rs b/crates/nvisy-pattern/src/validators/btc.rs new file mode 100644 index 00000000..7abc6d67 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/btc.rs @@ -0,0 +1,53 @@ +//! Bitcoin legacy-address (Base58Check) checksum validator. +//! +//! Validates P2PKH (`1…`) and P2SH (`3…`) addresses by decoding +//! the Base58 payload and verifying its trailing four-byte +//! double-SHA256 checksum. Bech32 / Bech32m addresses (`bc1…`) +//! are not handled here — those use a different polynomial check. + +/// Return `true` if `value` is a structurally valid Base58Check +/// Bitcoin address. +/// +/// Accepts P2PKH (version byte `0x00`, `1…`) and P2SH +/// (version byte `0x05`, `3…`) on mainnet. Rejects mismatched +/// version bytes, broken Base58, and bad checksums. 
+pub fn btc(value: &str) -> bool { + match bs58::decode(value.trim()).with_check(None).into_vec() { + Ok(bytes) if bytes.len() == 21 => matches!(bytes[0], 0x00 | 0x05), + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_known_p2pkh() { + // Satoshi's genesis-block coinbase address. + assert!(btc("1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa")); + } + + #[test] + fn accepts_known_p2sh() { + assert!(btc("3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy")); + } + + #[test] + fn rejects_bad_checksum() { + // Final char flipped. + assert!(!btc("1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNb")); + } + + #[test] + fn rejects_non_base58() { + assert!(!btc("1A1zP1eP5QGefi2DMPTfTL5SLmv7Divf0a")); + assert!(!btc("")); + } + + #[test] + fn rejects_unknown_version() { + // Bitcoin testnet P2PKH (version byte `0x6f`). + assert!(!btc("mipcBbFg9gMiCh81Kj8tqqdgoZub1ZJRfn")); + } +} diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index 4eb8d853..39722f54 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -8,16 +8,19 @@ //! can't. //! //! [`ValidatorRegistry::builtin`] ships universal validators -//! ([`luhn`], [`iban`], [`phone`], [`date`]) plus jurisdiction- -//! scoped sets re-exported from [`us`] (`"us.ssn"`, -//! `"us.aba_routing"`, `"us.npi"`, `"us.dea_number"`) and [`uk`] -//! (`"uk.nhs"`, `"uk.nino"`). Each validator is also re-exported +//! ([`luhn`], [`iban`], [`phone`], [`date`], [`btc`]) plus +//! jurisdiction-scoped sets re-exported from [`us`] (`"us.ssn"`, +//! `"us.aba_routing"`, `"us.npi"`, `"us.dea_number"`, +//! `"us.postal_code"`) and [`uk`] +//! (`"uk.nhs"`, `"uk.nino"`, `"uk.driving_licence"`, +//! `"uk.vehicle_registration"`). Each validator is also re-exported //! as a free function so consumers can compose a custom registry //! without taking the full set. //! //! [`Variant`]: crate::Variant //! 
[`Regex`]: crate::Regex +mod btc; mod date; mod iban; mod luhn; @@ -30,29 +33,52 @@ use std::borrow::Cow; use std::collections::HashMap; use std::sync::Arc; +use nvisy_core::primitive::{CountryCode, LanguageTag}; + +pub use self::btc::btc; pub use self::date::date; pub use self::iban::iban; pub use self::luhn::luhn; pub use self::phone::phone; +/// Per-call hints supplied to validators alongside the matched +/// string. +/// +/// Carries the caller's [`RecognizerInput`] jurisdiction and +/// language so validators that need region-aware semantics +/// (e.g. `phone`) can honour the caller's intent instead of +/// guessing across a fixed fallback set. Validators that don't +/// need either field can ignore it via `_ctx`. +/// +/// [`RecognizerInput`]: crate::recognition::RecognizerInput +#[derive(Debug, Clone, Default)] +pub struct ValidationContext { + /// ISO 3166-1 alpha-2 jurisdiction associated with the input, + /// when the caller specified one. + pub country: Option, + /// BCP-47 language tag associated with the input, when the + /// caller specified one. + pub language: Option, +} + /// Post-match validator returning whether a matched string is /// structurally valid. /// -/// Implemented by every `Fn(&str) -> bool + Send + Sync` via the -/// blanket impl, so plain function pointers slot in without a -/// wrapper type. Implement directly for types that need to carry -/// state (e.g. a remote-lookup client). +/// Implemented by every `Fn(&str, &ValidationContext) -> bool + +/// Send + Sync` via the blanket impl, so plain function pointers +/// slot in without a wrapper type. Implement directly for types +/// that need to carry state (e.g. a remote-lookup client). pub trait Validator: Send + Sync { /// Return `true` to keep the match, `false` to drop it. 
- fn validate(&self, matched: &str) -> bool; + fn validate(&self, matched: &str, ctx: &ValidationContext) -> bool; } impl Validator for F where - F: Fn(&str) -> bool + Send + Sync, + F: Fn(&str, &ValidationContext) -> bool + Send + Sync, { - fn validate(&self, matched: &str) -> bool { - self(matched) + fn validate(&self, matched: &str, ctx: &ValidationContext) -> bool { + self(matched, ctx) } } @@ -81,32 +107,38 @@ impl ValidatorRegistry { /// Construct a registry pre-loaded with the shipped built-in /// validators. /// - /// Universal keys: `"luhn"`, `"iban"`, `"phone"`, `"date"`. + /// Universal keys: `"luhn"`, `"iban"`, `"phone"`, `"date"`, + /// `"crypto.btc"`. /// /// US-scoped: `"us.ssn"`, `"us.aba_routing"`, `"us.npi"`, - /// `"us.dea_number"`. + /// `"us.dea_number"`, `"us.postal_code"`. /// - /// UK-scoped: `"uk.nhs"`, `"uk.nino"`. + /// UK-scoped: `"uk.nhs"`, `"uk.nino"`, + /// `"uk.driving_licence"`, `"uk.vehicle_registration"`. #[must_use] pub fn builtin() -> Self { Self::empty() - .with("luhn", luhn) - .with("iban", iban) + .with_simple("luhn", luhn) + .with_simple("iban", iban) .with("phone", phone) - .with("date", date) - .with("us.ssn", us::ssn) - .with("us.aba_routing", us::aba_routing) - .with("us.npi", us::npi) - .with("us.dea_number", us::dea_number) - .with("uk.nhs", uk::nhs) - .with("uk.nino", uk::nino) + .with_simple("date", date) + .with_simple("crypto.btc", btc) + .with_simple("us.ssn", us::ssn) + .with_simple("us.aba_routing", us::aba_routing) + .with_simple("us.npi", us::npi) + .with_simple("us.dea_number", us::dea_number) + .with_simple("us.postal_code", us::postal_code) + .with_simple("uk.nhs", uk::nhs) + .with_simple("uk.nino", uk::nino) + .with_simple("uk.driving_licence", uk::driving_licence) + .with_simple("uk.vehicle_registration", uk::vehicle_registration) } - /// Register `validator` under `name`, overwriting any previous - /// entry with the same key. 
+ /// Register a context-aware `validator` under `name`, + /// overwriting any previous entry with the same key. /// /// Override a built-in by registering under the same name - /// (e.g. `"luhn"`). + /// (e.g. `"phone"`). #[must_use] pub fn with(mut self, name: N, validator: V) -> Self where @@ -117,6 +149,19 @@ impl ValidatorRegistry { self } + /// Register a context-free `Fn(&str) -> bool` validator under + /// `name`. Convenience wrapper around [`Self::with`] for the + /// common case where the validator ignores + /// [`ValidationContext`]. + #[must_use] + pub fn with_simple(self, name: N, validator: F) -> Self + where + N: Into>, + F: Fn(&str) -> bool + Send + Sync + 'static, + { + self.with(name, move |s: &str, _: &ValidationContext| validator(s)) + } + /// Look up a validator by name. /// /// Returns `None` when the name is unregistered; the diff --git a/crates/nvisy-pattern/src/validators/phone.rs b/crates/nvisy-pattern/src/validators/phone.rs index 51d4cdd6..9539f0d4 100644 --- a/crates/nvisy-pattern/src/validators/phone.rs +++ b/crates/nvisy-pattern/src/validators/phone.rs @@ -1,73 +1,38 @@ -//! Phone-number structural validator. - -/// Return `true` if `value` has a plausible phone-number structure. -/// -/// All non-digit characters are stripped, then checks: -/// -/// - 7 to 15 digits (the ITU-T E.164 range). -/// - When the original begins with `+` (explicit E.164), the -/// digits must not start with `0` — no country code is `0…`. -/// National formats such as UK `020 7946 0958` keep their -/// trunk-prefix zero and remain valid. 
-pub fn phone(value: &str) -> bool { - let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect(); - let len = digits.len(); - - if !(7..=15).contains(&len) { - return false; - } - - if value.trim_start().starts_with('+') && digits.starts_with('0') { - return false; - } - - true -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn valid_us_numbers() { - assert!(phone("+1-555-123-4567")); - assert!(phone("(555) 123-4567")); - assert!(phone("555.123.4567")); - assert!(phone("5551234567")); - } - - #[test] - fn valid_international() { - assert!(phone("+44 20 7946 0958")); - assert!(phone("+49 30 12345678")); - assert!(phone("+81 3 1234 5678")); - } - - #[test] - fn too_few_digits() { - assert!(!phone("12345")); - assert!(!phone("123-45")); - } - - #[test] - fn too_many_digits() { - assert!(!phone("1234567890123456")); - } - - #[test] - fn e164_starting_with_zero_rejected() { - assert!(!phone("+0123456789012")); - } - - #[test] - fn national_format_with_trunk_zero_accepted() { - // UK national format keeps the leading 0 trunk prefix. - assert!(phone("020 7946 0958")); - assert!(phone("0207946 0958")); - } - - #[test] - fn local_number_with_seven_digits() { - assert!(phone("123-4567")); - } +//! Region-aware phone-number validator backed by the +//! `phonenumber` crate (Rust port of Google's libphonenumber). +//! +//! Two paths: +//! +//! 1. Inputs that parse as E.164 (carry their own `+CC` prefix) +//! validate directly, regardless of caller context. +//! 2. Inputs in national format (no leading `+`) need a region +//! hint. When [`ValidationContext::country`] is set we use it; +//! otherwise we fail closed — region-less national-format +//! matching is genuinely ambiguous (a 13-digit run can be a +//! valid IL/IN phone *and* the leading 13 digits of a Visa +//! PAN), so without a country signal we'd rather miss a +//! handful of national-format numbers than mislabel card and +//! account numbers as phones. 
+ +use phonenumber::country::Id; +use phonenumber::parse; +use std::str::FromStr; + +use super::ValidationContext; + +/// Return `true` when the trimmed `value` parses as a valid phone +/// number on its own (E.164 with an explicit `+CC` prefix) or, failing +/// that, under the region hint in `ctx.country`; `false` otherwise. +pub fn phone(value: &str, ctx: &ValidationContext) -> bool { + let trimmed = value.trim(); + + if parse(None, trimmed).map(|n| n.is_valid()).unwrap_or(false) { + return true; + } + + ctx.country + .and_then(|c| Id::from_str(c.as_str()).ok()) + .and_then(|region| parse(Some(region), trimmed).ok()) + .map(|n| n.is_valid()) + .unwrap_or(false) } diff --git a/crates/nvisy-pattern/src/validators/uk/driving_licence.rs b/crates/nvisy-pattern/src/validators/uk/driving_licence.rs new file mode 100644 index 00000000..a914a47a --- /dev/null +++ b/crates/nvisy-pattern/src/validators/uk/driving_licence.rs @@ -0,0 +1,71 @@ +//! UK Driving Licence (DVLA) structural validator. +//! +//! The 16-char DVLA number opens with a 5-char surname slot — +//! letters padded on the right with `9`s when the surname is +//! shorter than five characters. A licence whose surname slot +//! is *all* `9`s, or that places a `9` before a letter (e.g. +//! `9ABCD…`, `A9BCD…`), violates the padding rule and is +//! structurally invalid. + +/// Return `true` when the leading 5-char surname slot of a +/// 16-char DVLA driving licence number is structurally valid. +/// +/// Rejects an all-`9` surname, any `9` before a letter within the slot, +/// and any slot character other than `9` or an ASCII uppercase letter. +/// Does not re-validate the rest of the number — that is the regex's job. 
+pub fn driving_licence(value: &str) -> bool { + let surname: Vec = value + .chars() + .filter(|c| !c.is_ascii_whitespace()) + .take(5) + .collect(); + if surname.len() != 5 { + return false; + } + if surname.iter().all(|c| *c == '9') { + return false; + } + let mut padding_started = false; + for c in &surname { + match c { + '9' => padding_started = true, + c if c.is_ascii_uppercase() => { + if padding_started { + return false; + } + } + _ => return false, + } + } + true +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_padded_short_surname() { + // 4-letter surname `MORG` padded with one `9`. + assert!(driving_licence("MORG9753116SM9IJ")); + // 5-letter surname `MORGA`, no padding. + assert!(driving_licence("MORGA753116SM9IJ")); + } + + #[test] + fn rejects_all_nine_surname() { + assert!(!driving_licence("99999753116SM9IJ")); + } + + #[test] + fn rejects_padding_before_letter() { + // `9` precedes a letter in the surname slot. + assert!(!driving_licence("9MORG753116SM9IJ")); + assert!(!driving_licence("A9ORG753116SM9IJ")); + } + + #[test] + fn rejects_non_alpha_padding_in_surname() { + assert!(!driving_licence("M0RGA753116SM9IJ")); + } +} diff --git a/crates/nvisy-pattern/src/validators/uk/mod.rs b/crates/nvisy-pattern/src/validators/uk/mod.rs index 37eadd66..44f7c352 100644 --- a/crates/nvisy-pattern/src/validators/uk/mod.rs +++ b/crates/nvisy-pattern/src/validators/uk/mod.rs @@ -1,12 +1,17 @@ //! UK-specific post-match validators. //! //! Registered under the [`ValidatorRegistry::builtin`] set with -//! dotted names — `"uk.nhs"`, `"uk.nino"`. +//! dotted names — `"uk.nhs"`, `"uk.nino"`, +//! `"uk.driving_licence"`, `"uk.vehicle_registration"`. //! //! 
[`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin +mod driving_licence; mod nhs; mod nino; +mod vehicle_registration; +pub use self::driving_licence::driving_licence; pub use self::nhs::nhs; pub use self::nino::nino; +pub use self::vehicle_registration::vehicle_registration; diff --git a/crates/nvisy-pattern/src/validators/uk/nino.rs b/crates/nvisy-pattern/src/validators/uk/nino.rs index 0ff86604..4933c2b8 100644 --- a/crates/nvisy-pattern/src/validators/uk/nino.rs +++ b/crates/nvisy-pattern/src/validators/uk/nino.rs @@ -6,9 +6,14 @@ /// Return `true` when `value`'s leading two-letter prefix is not /// a reserved NINO prefix. /// -/// Reserved prefixes (case-insensitive): `BG`, `GB`, `NK`, `KN`, -/// `NT`, `TN`, `ZZ`. The check is structural only — it does not -/// confirm the trailing suffix letter or any HMRC issuance state. +/// Reserved prefixes (case-insensitive): +/// +/// - Whole pair: `BG`, `GB`, `NK`, `KN`, `NT`, `TN`, `ZZ`. +/// - First letter `O` (HMRC reserved; not blocked by the regex +/// character class, which spans `j-p`). +/// +/// The check is structural only — it does not confirm the +/// trailing suffix letter or any HMRC issuance state. pub fn nino(value: &str) -> bool { let prefix: String = value .chars() @@ -19,6 +24,9 @@ pub fn nino(value: &str) -> bool { return false; } let upper = prefix.to_ascii_uppercase(); + if upper.starts_with('O') { + return false; + } !matches!( upper.as_str(), "BG" | "GB" | "NK" | "KN" | "NT" | "TN" | "ZZ" @@ -54,4 +62,10 @@ mod tests { assert!(!nino("12345678A")); assert!(!nino("")); } + + #[test] + fn rejects_o_at_position_zero() { + assert!(!nino("OA123456A")); + assert!(!nino("oa123456A")); + } } diff --git a/crates/nvisy-pattern/src/validators/uk/vehicle_registration.rs b/crates/nvisy-pattern/src/validators/uk/vehicle_registration.rs new file mode 100644 index 00000000..7ff755e5 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/uk/vehicle_registration.rs @@ -0,0 +1,69 @@ +//! 
UK current-format Vehicle Registration Mark (VRM) age-ID +//! validator. +//! +//! Current (2001+) plates encode the issuance half-year as a +//! 2-digit "age identifier" at positions 3-4 (1-based): +//! +//! - March issue: `02..=29` (March 2002 through March 2029) +//! - September issue: `51..=79` (September 2001 through September +//! 2029) +//! +//! The recognizer regex permits the broader range `01..=79` +//! (cheap to express); this validator narrows it to the issued +//! windows that the DVLA actually allocates. + +/// Return `true` when the 2-digit age identifier embedded in a +/// 7-char current-format UK plate falls inside an issued range. +/// +/// Strips ASCII whitespace and `-`, then reads the characters at +/// zero-based indices 2 and 3 of the canonicalized string. +pub fn vehicle_registration(value: &str) -> bool { + let chars: Vec = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .collect(); + if chars.len() != 7 { + return false; + } + let age = match (chars[2].to_digit(10), chars[3].to_digit(10)) { + (Some(a), Some(b)) => a * 10 + b, + _ => return false, + }; + matches!(age, 2..=29 | 51..=79) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_march_window() { + assert!(vehicle_registration("AB02ABC")); + assert!(vehicle_registration("AB29ABC")); + assert!(vehicle_registration("AB 15 ABC")); + } + + #[test] + fn accepts_september_window() { + assert!(vehicle_registration("AB51ABC")); + assert!(vehicle_registration("AB79ABC")); + assert!(vehicle_registration("AB-65-ABC")); + } + + #[test] + fn rejects_out_of_range() { + // 01 was used briefly in 2001 but is not in the modern + // issued range; presidio rejects it too. 
+ assert!(!vehicle_registration("AB01ABC")); + assert!(!vehicle_registration("AB30ABC")); + assert!(!vehicle_registration("AB50ABC")); + assert!(!vehicle_registration("AB80ABC")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!vehicle_registration("AB51AB")); + assert!(!vehicle_registration("AB51ABCD")); + assert!(!vehicle_registration("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/us/mod.rs b/crates/nvisy-pattern/src/validators/us/mod.rs index 60b1d300..1a4d007c 100644 --- a/crates/nvisy-pattern/src/validators/us/mod.rs +++ b/crates/nvisy-pattern/src/validators/us/mod.rs @@ -8,9 +8,11 @@ mod aba_routing; mod dea_number; mod npi; +mod postal_code; mod ssn; pub use self::aba_routing::aba_routing; pub use self::dea_number::dea_number; pub use self::npi::npi; +pub use self::postal_code::postal_code; pub use self::ssn::ssn; diff --git a/crates/nvisy-pattern/src/validators/us/postal_code.rs b/crates/nvisy-pattern/src/validators/us/postal_code.rs new file mode 100644 index 00000000..96a193a6 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/us/postal_code.rs @@ -0,0 +1,40 @@ +//! US ZIP / ZIP+4 sanity validator. + +/// Return `true` if `value` is a plausible US ZIP code. +/// +/// Counts ASCII digits only, ignoring separators: accepts 5 (`12345`) or +/// 9 (`12345-1234`) digits; rejects an all-zeros first five (`00000`), +/// unassigned by the USPS but a frequent stand-in for "unknown". +pub fn postal_code(value: &str) -> bool { + let digits: Vec = value.chars().filter(char::is_ascii_digit).collect(); + if digits.len() != 5 && digits.len() != 9 { + return false; + } + !digits[..5].iter().all(|c| *c == '0') +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_valid() { + assert!(postal_code("90210")); + assert!(postal_code("97477-1234")); + // USPS lowest assigned prefix is 00501 (Holtsville, NY). 
+ assert!(postal_code("00501")); + } + + #[test] + fn rejects_all_zero_prefix() { + assert!(!postal_code("00000")); + assert!(!postal_code("00000-1234")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!postal_code("1234")); + assert!(!postal_code("123456")); + assert!(!postal_code("")); + } +} diff --git a/crates/nvisy-pattern/testdata/builtin/uk/identity.txt b/crates/nvisy-pattern/testdata/builtin/uk/identity.txt deleted file mode 100644 index 05dc0b42..00000000 --- a/crates/nvisy-pattern/testdata/builtin/uk/identity.txt +++ /dev/null @@ -1,12 +0,0 @@ -Patient handover for Mrs A. Patel. - -Personal details: - - NHS number: 943 476 5919 - - NINO: AB123456C - - Driving licence (DVLA): MORGA753116SM9IJ - - Address: 10 Downing Street, London SW1A 2AA - -Vehicle: BMW 3 Series, registration AB51 ABC, V5C on file. - -Please update the patient record (national health service form 4) -and bill the National Insurance reference shown above. diff --git a/crates/nvisy-toolkit/src/detection/chunks.rs b/crates/nvisy-toolkit/src/detection/chunks.rs index 6ae8f07c..2567b26b 100644 --- a/crates/nvisy-toolkit/src/detection/chunks.rs +++ b/crates/nvisy-toolkit/src/detection/chunks.rs @@ -71,7 +71,8 @@ impl RecognizerRegistryExt for RecognizerRegistry { { let mut out = Vec::new(); while let Some(chunk) = handler.next_chunk().await? { - let input = RecognizerInput::new(chunk.data.clone().into()); + let input = RecognizerInput::new(chunk.data.clone().into()) + .with_context_hints(M::chunk_hints(&chunk.location)); let text_entities = self.run::(input).await?; for text_entity in text_entities { let Some(loc) = handler @@ -108,6 +109,20 @@ pub trait LiftedFromText: Modality + Sized { /// against the source bytes of a chunk, plus the pre-lifted /// location, and produce a `Self`-modality entity. fn from_text(text_entity: Entity, location: Self::Location) -> Entity; + + /// Out-of-band context strings the recognizer should treat + /// as in-context for a chunk at `location`. 
Surfaces handler + /// metadata that doesn't live inside the chunk's payload — + /// notably the column header of a CSV/XLSX cell, which lifts + /// confidence on a low-base-score regex match the way the + /// surrounding sentence would in plain text. + /// + /// Default returns an empty `Vec`; modalities whose chunks + /// don't carry out-of-band metadata (`Text`, image regions, + /// audio segments) keep that default. + fn chunk_hints(_location: &Self::Location) -> Vec { + Vec::new() + } } impl LiftedFromText for Text { @@ -133,4 +148,8 @@ impl LiftedFromText for Tabular { } builder.build().expect("entity reshape") } + + fn chunk_hints(location: &TabularLocation) -> Vec { + location.column_name.iter().cloned().collect() + } } From c63a0e40722e93acd9905c7309f430a48653f188 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Tue, 16 Jun 2026 08:12:30 +0200 Subject: [PATCH 12/14] feat(codec,toolkit): chunk-level context hints for HTML + JSON MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the tabular-cell hint mechanism to all chunked formats by making hints a first-class field on Chunk, populated alongside data + location during next_chunk. Architecture: - Chunk gains pub hints: Vec. Hints are metadata the chunk's structural neighbours surface — CSV column headers, HTML parent-element text, JSON object keys — for downstream context-aware recognizers. - nvisy-toolkit detect() reads chunk.hints directly. The earlier LiftedFromText::chunk_hints and the briefly-added Handler::chunk_hints methods are both removed; the field on Chunk avoids a second handler call to recompute information next_chunk already had. - Handlers without useful out-of-band metadata (TXT, PDF, image, audio) initialise the field with Vec::new(). CSV handler: - Populates chunk.hints from chunk.location.column_name. Replaces the prior LiftedFromText::chunk_hints override, which had the same effect via a less-direct path. 
HTML handler: - RedactableItem gains pub hints: Vec. The DOM walk in build_items computes a per-text-node hint by collecting the text of the node's nearest block-level ancestor (excluding the node's own text). nearest_block_ancestor walks parents until it finds a tag in is_block_element's curated set (p, div, li, td, th, h1-h6, blockquote, dt, dd, section, article, aside, header, footer, main, nav, figcaption, caption). - Stopping at the immediate inline parent would yield only the chunk's own text — the surrounding sentence lives in the enclosing block. `

...the payment card 4111… is on file

` gives the chunk a hint of "the payment card is on file", which lifts CC=0.3 above threshold via the existing context-enhancer. - Note: neither html5ever, markup5ever, nor scraper exposes a block/inline classifier; the curated list is the simplest honest implementation. Future HTML elements not in the list degrade gracefully (walk continues to root, hints stay empty) rather than corrupting detection. JSON handler: - Leaf gains pub hints: Vec. parse_value and parse_array now thread an Option<&str> key_context; parse_object captures the just-parsed key and passes it to parse_value for the value. Array elements inherit the containing object's key so {"cards": ["4111…", "5555…"]} gives both PANs the "cards" hint. Top-level scalars and keys themselves stay hint-less. - The leaf's hint is copied onto Chunk.hints in next_chunk so recognizers see it via input.context_hints. Resolves codec_e2e_html and codec_e2e_json payment_card assertions without touching scores. Co-Authored-By: Claude Opus 4.7 --- crates/nvisy-codec/src/core/handler.rs | 10 +- .../src/handler/audio/mp3_handler.rs | 2 +- .../src/handler/audio/wav_handler.rs | 2 +- .../nvisy-codec/src/handler/image/macros.rs | 1 + .../src/handler/rich/pdf_handler.rs | 1 + .../src/handler/tabular/csv_handler.rs | 3 +- .../src/handler/text/html_handler.rs | 11 ++- .../src/handler/text/html_loader.rs | 91 +++++++++++++++++++ .../src/handler/text/json_handler.rs | 34 +++++-- .../src/handler/text/txt_handler.rs | 1 + crates/nvisy-toolkit/src/detection/chunks.rs | 20 +--- 11 files changed, 144 insertions(+), 32 deletions(-) diff --git a/crates/nvisy-codec/src/core/handler.rs b/crates/nvisy-codec/src/core/handler.rs index 83106862..9ecd97b6 100644 --- a/crates/nvisy-codec/src/core/handler.rs +++ b/crates/nvisy-codec/src/core/handler.rs @@ -29,13 +29,21 @@ use crate::content::{ContentData, ContentSource}; /// /// `data` is the per-modality wire payload; `location` is the /// coordinate the handler will accept in [`Handler::read`] 
/ -/// [`Handler::redact`] to address the same chunk again. +/// [`Handler::redact`] to address the same chunk again. `hints` +/// carries out-of-band context strings the chunk's structural +/// neighbours surface — CSV/XLSX column headers, JSON object +/// keys, HTML parent-element text — for downstream context-aware +/// recognizers; handlers without such metadata leave it empty. #[derive(Debug, Clone, PartialEq)] pub struct Chunk { /// Coordinate addressing this chunk inside the handler. pub location: M::Location, /// Wire payload at the chunk's location. pub data: M::Data, + /// Out-of-band context strings recognizers should treat as + /// in-context (column headers, parent element text, …). + /// Empty when the handler has no such metadata to surface. + pub hints: Vec, } /// Per-modality capability trait every format handler implements. diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index e3d376a6..daa9894c 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -108,7 +108,7 @@ impl Handler