diff --git a/crates/nvisy-cli/src/config/mod.rs b/crates/nvisy-cli/src/config/mod.rs index 03853fe3..c2fa93b9 100644 --- a/crates/nvisy-cli/src/config/mod.rs +++ b/crates/nvisy-cli/src/config/mod.rs @@ -29,7 +29,7 @@ use std::time::Duration; use anyhow::Context; use clap::{Args, Parser}; -use nvisy_engine::pipeline::RuntimeConfig; +use nvisy_engine::core::RuntimeConfig; use serde::Deserialize; pub use self::server::ServerConfig; diff --git a/crates/nvisy-codec/src/handler/audio/duration.rs b/crates/nvisy-codec/src/handler/audio/duration.rs index a09e62ec..421ccfa5 100644 --- a/crates/nvisy-codec/src/handler/audio/duration.rs +++ b/crates/nvisy-codec/src/handler/audio/duration.rs @@ -44,16 +44,18 @@ const TARGET: &str = "nvisy_codec::handler::audio::duration"; /// first track lacks a timebase or a known duration, or when the /// computed duration would overflow `i64` microseconds. pub(super) fn probe_duration_us(bytes: &Bytes, extension_hint: &str) -> Result { - let mss = MediaSourceStream::new( - Box::new(Cursor::new(bytes.clone())), - Default::default(), - ); + let mss = MediaSourceStream::new(Box::new(Cursor::new(bytes.clone())), Default::default()); let mut hint = Hint::new(); hint.with_extension(extension_hint); let reader = get_probe() - .probe(&hint, mss, FormatOptions::default(), MetadataOptions::default()) + .probe( + &hint, + mss, + FormatOptions::default(), + MetadataOptions::default(), + ) .map_err(|e| Error::validation(format!("audio probe failed: {e}"), TARGET))?; let track = reader @@ -61,9 +63,9 @@ pub(super) fn probe_duration_us(bytes: &Bytes, extension_hint: &str) -> Result Result { - let mss = MediaSourceStream::new( - Box::new(Cursor::new(bytes.clone())), - Default::default(), - ); + let mss = MediaSourceStream::new(Box::new(Cursor::new(bytes.clone())), Default::default()); let mut hint = Hint::new(); hint.with_extension("mp3"); let reader = get_probe() - .probe(&hint, mss, FormatOptions::default(), MetadataOptions::default()) + .probe( + &hint, 
+ mss, + FormatOptions::default(), + MetadataOptions::default(), + ) .map_err(|e| Error::validation(format!("MP3 probe failed: {e}"), TARGET))?; let track = reader @@ -87,15 +89,17 @@ pub(super) fn probe_channels(bytes: &Bytes) -> Result { /// [`super::redact::apply`] helper and for handing back to /// [`encode_from_pcm`]. pub(super) fn decode_to_pcm(bytes: &Bytes) -> Result { - let mss = MediaSourceStream::new( - Box::new(Cursor::new(bytes.clone())), - Default::default(), - ); + let mss = MediaSourceStream::new(Box::new(Cursor::new(bytes.clone())), Default::default()); let mut hint = Hint::new(); hint.with_extension("mp3"); let mut reader = get_probe() - .probe(&hint, mss, FormatOptions::default(), MetadataOptions::default()) + .probe( + &hint, + mss, + FormatOptions::default(), + MetadataOptions::default(), + ) .map_err(|e| Error::validation(format!("MP3 probe failed: {e}"), TARGET))?; let track = reader @@ -110,9 +114,9 @@ pub(super) fn decode_to_pcm(bytes: &Bytes) -> Result { .ok_or_else(|| Error::validation("MP3 track is missing audio codec params", TARGET))? 
.clone(); - let sample_rate = audio_params.sample_rate.ok_or_else(|| { - Error::validation("MP3 track is missing a sample rate", TARGET) - })?; + let sample_rate = audio_params + .sample_rate + .ok_or_else(|| Error::validation("MP3 track is missing a sample rate", TARGET))?; let channels = audio_params .channels .as_ref() @@ -163,10 +167,7 @@ pub(super) fn decode_to_pcm(bytes: &Bytes) -> Result { continue; } Err(e) => { - return Err(Error::validation( - format!("MP3 decode failed: {e}"), - TARGET, - )); + return Err(Error::validation(format!("MP3 decode failed: {e}"), TARGET)); } } } @@ -250,8 +251,8 @@ pub(super) fn encode_from_pcm( ) -> Result, Error> { let bitrate = snap_bitrate(target_bitrate_bps); - let mut encoder = Builder::new() - .ok_or_else(|| Error::validation("LAME builder failed", TARGET))?; + let mut encoder = + Builder::new().ok_or_else(|| Error::validation("LAME builder failed", TARGET))?; encoder .set_sample_rate(sample_rate) .map_err(|e| Error::validation(format!("LAME sample-rate rejected: {e:?}"), TARGET))?; diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index b37d6a7a..e3d376a6 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -24,10 +24,9 @@ use nvisy_core::modality::{Audio, AudioData, AudioLocation}; use nvisy_core::primitive::TimeSpan; use nvisy_core::redaction::Redactions; -use super::Mp3Loader; use super::duration::probe_duration_us; use super::mp3_codec::{decode_to_pcm, encode_from_pcm}; -use super::redact; +use super::{Mp3Loader, redact}; use crate::content::{ContentData, ContentSource}; use crate::{Chunk, Format, FormatId, Handler}; @@ -229,7 +228,10 @@ mod tests { // boundary where smear is biggest. 
let start = sr * 3 / 4; let end = sr * 5 / 4; - let mean_abs: f32 = decoded.samples[start..end].iter().map(|s| s.abs()).sum::() + let mean_abs: f32 = decoded.samples[start..end] + .iter() + .map(|s| s.abs()) + .sum::() / (end - start) as f32; assert!( mean_abs < 0.05, diff --git a/crates/nvisy-codec/src/handler/audio/mp3_loader.rs b/crates/nvisy-codec/src/handler/audio/mp3_loader.rs index 6106b058..1b6010ec 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_loader.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_loader.rs @@ -68,20 +68,19 @@ mod tests { #[tokio::test] async fn accepts_stereo_mp3() { - let loader = Mp3Loader; let bytes = fixture_stereo_mp3(); let content = ContentData::new(ContentSource::new(), bytes); - loader.decode(content).await.expect("stereo MP3 should load"); + let handler = Mp3Loader.decode(content).await; + handler.expect("stereo MP3 should load"); } #[tokio::test] async fn rejects_garbage_bytes() { - let loader = Mp3Loader; let content = ContentData::new( ContentSource::new(), Bytes::from_static(b"definitely not an mp3"), ); - let err = loader.decode(content).await.unwrap_err(); + let err = Mp3Loader.decode(content).await.unwrap_err(); assert!(err.to_string().contains("MP3 probe failed")); } } diff --git a/crates/nvisy-core/src/context/enhancer.rs b/crates/nvisy-core/src/context/enhancer.rs index 1c0501ba..38ff3794 100644 --- a/crates/nvisy-core/src/context/enhancer.rs +++ b/crates/nvisy-core/src/context/enhancer.rs @@ -221,7 +221,8 @@ mod tests { use super::*; use crate::context::Context; use crate::entity::{ - EntityKind, ModelProvenance, PatternProvenance, TrailProvenance, TrailStepKind, + EntityLabelRef, ModelProvenance, PatternProvenance, TrailProvenance, TrailStepKind, + builtins, }; use crate::modality::{Text, TextLocation}; @@ -240,7 +241,7 @@ mod tests { format!("pattern `{name}` matched"), ); Entity::builder() - .with_entity_kind(EntityKind::GovernmentId) + 
.with_label(EntityLabelRef::from(builtins::GOVERNMENT_ID.name.clone())) .with_trail(vec![step]) .with_confidence(confidence) .with_location(TextLocation::new(span.start, span.end)) @@ -258,7 +259,7 @@ mod tests { format!("model `{name}` matched"), ); Entity::builder() - .with_entity_kind(EntityKind::PersonName) + .with_label(EntityLabelRef::from(builtins::PERSON_NAME.name.clone())) .with_trail(vec![step]) .with_confidence(confidence) .with_location(TextLocation::new(span.start, span.end)) diff --git a/crates/nvisy-core/src/entity/annotation.rs b/crates/nvisy-core/src/entity/annotation.rs index b399ced6..5f9b419b 100644 --- a/crates/nvisy-core/src/entity/annotation.rs +++ b/crates/nvisy-core/src/entity/annotation.rs @@ -25,7 +25,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use super::{AnnotationProvenance, Entity, EntityKind, TrailProvenance, TrailStep}; +use super::{AnnotationProvenance, Entity, EntityLabelRef, TrailProvenance, TrailStep, builtins}; use crate::modality::{Modality, Overlap}; use crate::primitive::Confidence; @@ -82,16 +82,14 @@ pub enum AnnotationStrength { pub enum AnnotationKind { /// Pre-identified region the user wants treated as sensitive. Inclusion { - /// Specific entity kind. `None` when the user wants the - /// region treated as sensitive without committing to a - /// kind — synthesised entities fall back to - /// [`EntityKind::Unresolved`]. The broad - /// [`EntityCategory`] is derived via - /// [`EntityKind::category`]. + /// Label to attach to the synthesised entity. `None` + /// when the user wants the region treated as sensitive + /// without committing to a kind — synthesised entities + /// then fall back to [`builtins::UNRESOLVED`]. 
/// - /// [`EntityCategory`]: super::EntityCategory - #[serde(skip_serializing_if = "Option::is_none")] - entity_kind: Option, + /// [`builtins::UNRESOLVED`]: super::builtins::UNRESOLVED + #[serde(default, skip_serializing_if = "Option::is_none")] + label: Option, /// Modality-specific location this inclusion targets. target: M::Location, /// Whether this is an advisory [`Hint`] (LLM may reject) or @@ -137,7 +135,7 @@ impl Annotation { /// [`Hint`]: AnnotationStrength::Hint pub fn to_inclusion_entity(&self) -> Option> { let AnnotationKind::Inclusion { - entity_kind, + label, target, strength: AnnotationStrength::Assert, } = &self.kind @@ -145,8 +143,12 @@ impl Annotation { return None; }; + let label_ref = label + .clone() + .unwrap_or_else(|| EntityLabelRef::from(builtins::UNRESOLVED.name.clone())); + let entity = Entity::builder() - .with_entity_kind(entity_kind.unwrap_or(EntityKind::Unresolved)) + .with_label(label_ref) .with_trail(vec![TrailStep::recognition( "annotation", Confidence::MAX, @@ -212,7 +214,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::entity::EntityCategory; + use crate::entity::builtins; use crate::modality::{Image, ImageLocation, Text, TextLocation}; use crate::primitive::BoundingBox; @@ -220,7 +222,7 @@ mod tests { Annotation { name: None, kind: AnnotationKind::Inclusion { - entity_kind: Some(EntityKind::PersonName), + label: Some(EntityLabelRef::from(builtins::PERSON_NAME.name.clone())), target: TextLocation::new(start, end), strength, }, @@ -263,14 +265,13 @@ mod tests { let ann: Annotation = Annotation { name: None, kind: AnnotationKind::Inclusion { - entity_kind: None, + label: None, target: TextLocation::new(0, 10), strength: AnnotationStrength::Assert, }, }; let entity = ann.to_inclusion_entity().unwrap(); - assert_eq!(entity.category(), EntityCategory::Unresolved); - assert_eq!(entity.entity_kind, EntityKind::Unresolved); + assert_eq!(entity.label.as_str(), builtins::UNRESOLVED.name.as_str()); } #[test] @@ -285,7 +286,7 
@@ mod tests { let ann: Annotation = Annotation { name: Some("face".into()), kind: AnnotationKind::Inclusion { - entity_kind: Some(EntityKind::PersonName), + label: Some(EntityLabelRef::from(builtins::FACE.name.clone())), target: ImageLocation::new(bbox), strength: AnnotationStrength::Assert, }, diff --git a/crates/nvisy-core/src/entity/category.rs b/crates/nvisy-core/src/entity/category.rs deleted file mode 100644 index 652c0ef5..00000000 --- a/crates/nvisy-core/src/entity/category.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! Broad entity category classification. -//! -//! [`EntityCategory`] groups related [`EntityKind`] -//! variants into policy-addressable buckets. Policy selectors can -//! target an entire category (e.g. "redact all financial data") without -//! enumerating individual kinds. -//! -//! [`EntityKind`]: super::EntityKind - -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use strum::{Display, EnumString}; - -/// Broad category of sensitive data. -/// -/// Each [`EntityKind`] maps to exactly one category -/// via [`EntityKind::category()`]. -/// -/// [`EntityKind`]: super::EntityKind -/// [`EntityKind::category()`]: super::EntityKind::category -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -#[non_exhaustive] -pub enum EntityCategory { - /// Personal identity: names, government IDs, dates of birth, and - /// other attributes that directly identify a natural person. - PersonalIdentity, - /// Contact information: email addresses, phone numbers, physical - /// addresses, postal codes, and URLs. - ContactInfo, - /// Demographic attributes: age, gender, ethnicity, religion, - /// nationality, and citizenship. - Demographic, - /// Financial instruments and accounts: payment cards, bank - /// accounts, routing numbers, IBAN, crypto addresses, and - /// monetary amounts. 
- Financial, - /// Protected health information: medical record numbers, - /// insurance IDs, prescriptions, diagnoses, and medications. - Health, - /// Biometric identifiers: fingerprints, voiceprints, retina - /// scans, and facial geometry templates. - Biometric, - /// Secrets and credentials: passwords, API keys, authentication - /// tokens, and private cryptographic keys. - Credentials, - /// Network and device identifiers: IP addresses, MAC addresses, - /// device IDs, and usernames. - NetworkIdentifier, - /// Geographic and spatial data: GPS coordinates and geolocation - /// metadata. - Location, - /// Sensitive visual elements detected in images or video: - /// faces, handwriting, signatures, logos, and barcodes. - Visual, - /// Organizational identifiers: company names, departments, - /// facilities, and institutional reference numbers. - Organizational, - /// General-purpose entities surfaced by zero-shot models that - /// are not strictly PII but are routinely useful for policy - /// routing or document structuring: events, occupations, - /// products, quantities. - GeneralPurpose, - /// Fallback bucket for entities a recognizer flagged as sensitive - /// but could not place into a more specific category. Use sparingly - /// — every recognizer should prefer a precise category when one - /// exists. - #[default] - Unresolved, -} diff --git a/crates/nvisy-core/src/entity/kind.rs b/crates/nvisy-core/src/entity/kind.rs deleted file mode 100644 index c435de7c..00000000 --- a/crates/nvisy-core/src/entity/kind.rs +++ /dev/null @@ -1,508 +0,0 @@ -//! Concrete entity kind enumeration. -//! -//! [`EntityKind`] enumerates the types of sensitive data the platform -//! can detect or redact. Each variant maps to a stable `snake_case` -//! string for serialization and display, and to an [`EntityCategory`] -//! via [`EntityKind::category`]. 
- -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use strum::{Display, EnumIter, EnumString, IntoEnumIterator}; - -use super::category::EntityCategory; - -/// Specific kind of sensitive entity detected or targeted for redaction. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Display)] -#[derive(EnumIter, EnumString, Serialize, Deserialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -#[non_exhaustive] -pub enum EntityKind { - // Personal identity - /// Person name (full, first, or last). - PersonName, - /// Date of birth. - DateOfBirth, - /// Government-issued identification number (SSN, SIN, Aadhaar, national ID, etc.). - GovernmentId, - /// Tax identification number (ITIN, EIN, TIN, etc.). - TaxId, - /// Driver's license number. - DriversLicense, - /// Passport number. - PassportNumber, - /// National insurance or social-security equivalent (NI, BSN, AHVN, etc.). - NationalInsuranceNumber, - /// Vehicle identification number (VIN). - VehicleId, - /// License plate number. - LicensePlate, - - // Contact information - /// Email address. - EmailAddress, - /// Phone number. - PhoneNumber, - /// Physical or mailing address. - Address, - /// Postal or ZIP code. - PostalCode, - /// URL or hyperlink. - Url, - - // Demographic - /// Age value. - Age, - /// Gender identity. - Gender, - /// Racial or ethnic background. - Ethnicity, - /// Religious affiliation. - Religion, - /// Nationality. - Nationality, - /// Citizenship status. - Citizenship, - /// Language or dialect spoken. - Language, - - // Financial - /// Payment card number (credit or debit). - PaymentCard, - /// Payment card security code (CVV/CVC). - CardSecurityCode, - /// Payment card expiration date. - CardExpiry, - /// Bank account number. - BankAccount, - /// Bank routing or transit number. - BankRouting, - /// International Bank Account Number (IBAN). - Iban, - /// SWIFT / BIC code. - SwiftCode, - /// Cryptocurrency wallet address. 
- CryptoAddress, - /// Currency name or ISO 4217 code (USD, US Dollar, EUR, BTC, - /// Bitcoin, …). Distinct from a concrete [`Amount`]. - /// - /// [`Amount`]: Self::Amount - Currency, - /// Monetary amount. - Amount, - - // Health - /// Medical or patient identifier. - MedicalId, - /// Insurance policy number. - InsuranceId, - /// Prescription number. - PrescriptionId, - /// Medical diagnosis or condition. - Diagnosis, - /// Drug or medication name in a patient context. - Medication, - - // Biometric - /// Fingerprint template or minutiae data. - Fingerprint, - /// Voiceprint or speaker embedding. - Voiceprint, - /// Retina or iris scan data. - RetinaScan, - /// Facial geometry or face embedding (not a photo: see [`Face`]). - /// - /// [`Face`]: Self::Face - FacialGeometry, - - // Credentials - /// Password or passphrase. - Password, - /// API key. - ApiKey, - /// Authentication or session token. - AuthToken, - /// Private cryptographic key. - PrivateKey, - - // Network and device identifiers - /// IP address (v4 or v6). - IpAddress, - /// MAC (hardware) address. - MacAddress, - /// Device identifier (IMEI, IDFA, etc.). - DeviceId, - /// Username or online handle. - Username, - - // Location - /// GPS coordinates (latitude / longitude). - Coordinates, - /// Geolocation metadata (EXIF, cell tower, etc.). - GeolocationMetadata, - - // Visual - /// Detected human face in an image. - Face, - /// Handwritten text region. - Handwriting, - /// Handwritten or digital signature. - Signature, - /// Logo or brand mark. - Logo, - /// Barcode (1D) or QR code (2D). - Barcode, - - // Organizational - /// Company or institution name. - OrganizationName, - /// Internal division or department name. - DepartmentName, - /// Physical facility name (hospital, office, school). - FacilityName, - /// Legal or administrative case identifier. - CaseNumber, - /// Internal reference number (invoice, contract, PO, employee number, membership ID). 
- InternalId, - - // Temporal - /// Date, time, or datetime value. - DateTime, - - // General-purpose NER labels (commonly emitted by zero-shot - // models like GLiNER): not strictly PII but useful to flag for - // policy routing, redaction overrides, or downstream - // structuring. - /// Event reference (conferences, weddings, public happenings). - Event, - /// Occupation, role, or job title. - Occupation, - /// Product, service, or model name. - Product, - /// Numeric quantity or measurement (distinct from monetary - /// [`Amount`]). - /// - /// [`Amount`]: Self::Amount - Quantity, - - /// Fallback kind for entities a recognizer flagged as sensitive - /// but could not classify into a more specific kind. Pairs with - /// [`EntityCategory::Unresolved`]. - #[default] - Unresolved, -} - -impl EntityKind { - /// Every defined [`EntityKind`] variant, in declaration order. - /// - /// Use with combinators to build category-filtered allowlists - /// without enumerating variants by hand: - /// - /// ```ignore - /// let text_kinds: Vec = EntityKind::all() - /// .filter(|k| !k.is_biometric() && !k.is_visual()) - /// .collect(); - /// ``` - pub fn all() -> impl Iterator { - ::iter() - } - - /// Returns the [`EntityCategory`] this entity kind belongs to. 
- pub fn category(&self) -> EntityCategory { - match self { - // Personal identity - Self::PersonName - | Self::DateOfBirth - | Self::GovernmentId - | Self::TaxId - | Self::DriversLicense - | Self::PassportNumber - | Self::NationalInsuranceNumber - | Self::VehicleId - | Self::LicensePlate => EntityCategory::PersonalIdentity, - - // Contact - Self::EmailAddress - | Self::PhoneNumber - | Self::Address - | Self::PostalCode - | Self::Url => EntityCategory::ContactInfo, - - // Demographic - Self::Age - | Self::Gender - | Self::Ethnicity - | Self::Religion - | Self::Nationality - | Self::Citizenship - | Self::Language => EntityCategory::Demographic, - - // Financial - Self::PaymentCard - | Self::CardSecurityCode - | Self::CardExpiry - | Self::BankAccount - | Self::BankRouting - | Self::Iban - | Self::SwiftCode - | Self::CryptoAddress - | Self::Currency - | Self::Amount => EntityCategory::Financial, - - // Health - Self::MedicalId - | Self::InsuranceId - | Self::PrescriptionId - | Self::Diagnosis - | Self::Medication => EntityCategory::Health, - - // Biometric - Self::Fingerprint | Self::Voiceprint | Self::RetinaScan | Self::FacialGeometry => { - EntityCategory::Biometric - } - - // Credentials - Self::Password | Self::ApiKey | Self::AuthToken | Self::PrivateKey => { - EntityCategory::Credentials - } - - // Network - Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username => { - EntityCategory::NetworkIdentifier - } - - // Location - Self::Coordinates | Self::GeolocationMetadata => EntityCategory::Location, - - // Visual - Self::Face | Self::Handwriting | Self::Signature | Self::Logo | Self::Barcode => { - EntityCategory::Visual - } - - // Organizational - Self::OrganizationName - | Self::DepartmentName - | Self::FacilityName - | Self::CaseNumber - | Self::InternalId => EntityCategory::Organizational, - - // Temporal (grouped under PersonalIdentity: bare dates most - // commonly appear alongside personal data and are regulated - // as PII by GDPR/CCPA) - 
Self::DateTime => EntityCategory::PersonalIdentity, - - // General-purpose - Self::Event | Self::Occupation | Self::Product | Self::Quantity => { - EntityCategory::GeneralPurpose - } - - Self::Unresolved => EntityCategory::Unresolved, - } - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::PersonalIdentity`]. - #[must_use] - pub fn is_personal_identity(&self) -> bool { - self.category() == EntityCategory::PersonalIdentity - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::ContactInfo`]. - #[must_use] - pub fn is_contact_info(&self) -> bool { - self.category() == EntityCategory::ContactInfo - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Demographic`]. - #[must_use] - pub fn is_demographic(&self) -> bool { - self.category() == EntityCategory::Demographic - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Financial`]. - #[must_use] - pub fn is_financial(&self) -> bool { - self.category() == EntityCategory::Financial - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Health`]. - #[must_use] - pub fn is_health(&self) -> bool { - self.category() == EntityCategory::Health - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Biometric`]. - #[must_use] - pub fn is_biometric(&self) -> bool { - self.category() == EntityCategory::Biometric - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Credentials`]. - #[must_use] - pub fn is_credentials(&self) -> bool { - self.category() == EntityCategory::Credentials - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::NetworkIdentifier`]. - #[must_use] - pub fn is_network_identifier(&self) -> bool { - self.category() == EntityCategory::NetworkIdentifier - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Location`]. 
- #[must_use] - pub fn is_location(&self) -> bool { - self.category() == EntityCategory::Location - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Visual`]. - #[must_use] - pub fn is_visual(&self) -> bool { - self.category() == EntityCategory::Visual - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Organizational`]. - #[must_use] - pub fn is_organizational(&self) -> bool { - self.category() == EntityCategory::Organizational - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::GeneralPurpose`]. - #[must_use] - pub fn is_general_purpose(&self) -> bool { - self.category() == EntityCategory::GeneralPurpose - } - - /// Convenience predicate: this kind has a recognisable - /// structural shape — fixed character-class layout, separators - /// at meaningful positions, fixed length — that anonymizers - /// and validators may want to preserve verbatim. - /// - /// True for: IBAN, payment cards (number / CVV / expiry), - /// bank accounts and routing, SWIFT/BIC, postal codes, phone - /// numbers, email addresses, dates and date-times, IP and MAC - /// addresses, license plates, coordinates. False for free-form - /// names, addresses, occupations, tokens, etc. 
- #[must_use] - pub fn is_structured(&self) -> bool { - matches!( - self, - Self::PaymentCard - | Self::CardSecurityCode - | Self::CardExpiry - | Self::Iban - | Self::BankAccount - | Self::BankRouting - | Self::SwiftCode - | Self::PostalCode - | Self::PhoneNumber - | Self::EmailAddress - | Self::DateOfBirth - | Self::DateTime - | Self::IpAddress - | Self::MacAddress - | Self::LicensePlate - | Self::Coordinates - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn category_personal_identity() { - assert_eq!( - EntityKind::GovernmentId.category(), - EntityCategory::PersonalIdentity - ); - assert_eq!( - EntityKind::PersonName.category(), - EntityCategory::PersonalIdentity - ); - assert_eq!( - EntityKind::DateOfBirth.category(), - EntityCategory::PersonalIdentity - ); - } - - #[test] - fn category_contact_info() { - assert_eq!( - EntityKind::EmailAddress.category(), - EntityCategory::ContactInfo - ); - assert_eq!(EntityKind::Address.category(), EntityCategory::ContactInfo); - } - - #[test] - fn category_demographic() { - assert_eq!(EntityKind::Gender.category(), EntityCategory::Demographic); - assert_eq!( - EntityKind::Ethnicity.category(), - EntityCategory::Demographic - ); - assert_eq!(EntityKind::Religion.category(), EntityCategory::Demographic); - } - - #[test] - fn category_financial() { - assert_eq!( - EntityKind::PaymentCard.category(), - EntityCategory::Financial - ); - assert_eq!(EntityKind::Iban.category(), EntityCategory::Financial); - } - - #[test] - fn category_health() { - assert_eq!(EntityKind::MedicalId.category(), EntityCategory::Health); - assert_eq!(EntityKind::Diagnosis.category(), EntityCategory::Health); - assert_eq!(EntityKind::Medication.category(), EntityCategory::Health); - } - - #[test] - fn category_credentials() { - assert_eq!(EntityKind::Password.category(), EntityCategory::Credentials); - assert_eq!(EntityKind::ApiKey.category(), EntityCategory::Credentials); - } - - #[test] - fn category_biometric() { - assert_eq!( - 
EntityKind::Fingerprint.category(), - EntityCategory::Biometric - ); - assert_eq!(EntityKind::Voiceprint.category(), EntityCategory::Biometric); - assert_eq!(EntityKind::RetinaScan.category(), EntityCategory::Biometric); - assert_eq!(EntityKind::Face.category(), EntityCategory::Visual); - } - - #[test] - fn category_organizational() { - assert_eq!( - EntityKind::OrganizationName.category(), - EntityCategory::Organizational - ); - assert_eq!( - EntityKind::CaseNumber.category(), - EntityCategory::Organizational - ); - assert_eq!( - EntityKind::InternalId.category(), - EntityCategory::Organizational - ); - } -} diff --git a/crates/nvisy-core/src/entity/label/builtins.rs b/crates/nvisy-core/src/entity/label/builtins.rs new file mode 100644 index 00000000..8c7bd9d9 --- /dev/null +++ b/crates/nvisy-core/src/entity/label/builtins.rs @@ -0,0 +1,178 @@ +//! Built-in [`EntityLabel`] constants. +//! +//! Each constant carries a category tag (`personal_identity`, +//! `financial`, …) plus cross-cutting tags where applicable +//! (`pii`, `phi`, `pci`). Selectors can match by label name *or* +//! by tag without the workspace modelling categories as a +//! separate enum. +//! +//! The `BUILT_INS` slice indexes every constant for the +//! [`EntityLabelCatalog::with_builtins`] constructor; the +//! constants themselves are public and reachable by name (e.g. +//! `builtins::PERSON_NAME`). +//! +//! [`EntityLabelCatalog::with_builtins`]: super::EntityLabelCatalog::with_builtins + +use std::sync::LazyLock; + +use super::entity_label::EntityLabel; + +macro_rules! label { + ($vis:vis $ident:ident, $name:literal, $desc:literal, [ $($tag:literal),* $(,)? 
]) => { + $vis static $ident: LazyLock = LazyLock::new(|| { + EntityLabel::from_static($name, Some($desc), &[$($tag),*]) + }); + }; +} + +label!(pub PERSON_NAME, "person_name","Person name (full, first, or last).", ["personal_identity", "pii"]); +label!(pub DATE_OF_BIRTH, "date_of_birth","Date of birth.", ["personal_identity", "pii"]); +label!(pub GOVERNMENT_ID, "government_id","Government-issued identification number (SSN, SIN, Aadhaar, national ID, etc.).", ["personal_identity", "pii"]); +label!(pub TAX_ID, "tax_id","Tax identification number (ITIN, EIN, TIN, etc.).", ["personal_identity", "pii"]); +label!(pub DRIVERS_LICENSE, "drivers_license","Driver's license number.", ["personal_identity", "pii"]); +label!(pub PASSPORT_NUMBER, "passport_number","Passport number.", ["personal_identity", "pii"]); +label!(pub NATIONAL_INSURANCE_NUMBER, "national_insurance_number","National insurance or social-security equivalent (NI, BSN, AHVN, etc.).", ["personal_identity", "pii"]); +label!(pub VEHICLE_ID, "vehicle_id","Vehicle identification number (VIN).", ["personal_identity"]); +label!(pub LICENSE_PLATE, "license_plate","License plate number.", ["personal_identity"]); +label!(pub EMAIL_ADDRESS, "email_address","Email address.", ["contact_info", "pii"]); +label!(pub PHONE_NUMBER, "phone_number","Phone number.", ["contact_info", "pii"]); +label!(pub ADDRESS, "address","Physical or mailing address.", ["contact_info", "pii"]); +label!(pub POSTAL_CODE, "postal_code","Postal or ZIP code.", ["contact_info"]); +label!(pub URL, "url","URL or hyperlink.", ["contact_info"]); +label!(pub AGE, "age","Age value.", ["demographic", "pii"]); +label!(pub GENDER, "gender","Gender identity.", ["demographic", "pii"]); +label!(pub ETHNICITY, "ethnicity","Racial or ethnic background.", ["demographic", "pii"]); +label!(pub RELIGION, "religion","Religious affiliation.", ["demographic", "pii"]); +label!(pub NATIONALITY, "nationality","Nationality.", ["demographic", "pii"]); +label!(pub CITIZENSHIP, 
"citizenship","Citizenship status.", ["demographic", "pii"]); +label!(pub LANGUAGE, "language","Language or dialect spoken.", ["demographic"]); +label!(pub PAYMENT_CARD, "payment_card","Payment card number (credit or debit).", ["financial", "pci", "pii"]); +label!(pub CARD_SECURITY_CODE, "card_security_code","Payment card security code (CVV/CVC).", ["financial", "pci"]); +label!(pub CARD_EXPIRY, "card_expiry","Payment card expiration date.", ["financial", "pci"]); +label!(pub BANK_ACCOUNT, "bank_account","Bank account number.", ["financial", "pii"]); +label!(pub BANK_ROUTING, "bank_routing","Bank routing or transit number.", ["financial"]); +label!(pub IBAN, "iban","International Bank Account Number (IBAN).", ["financial", "pii"]); +label!(pub SWIFT_CODE, "swift_code","SWIFT/BIC code.", ["financial"]); +label!(pub CRYPTO_ADDRESS, "crypto_address","Cryptocurrency wallet address.", ["financial", "pii"]); +label!(pub CURRENCY, "currency","Currency code or symbol.", ["financial"]); +label!(pub AMOUNT, "amount","Monetary amount.", ["financial"]); +label!(pub MEDICAL_ID, "medical_id","Medical record number.", ["health", "phi", "pii"]); +label!(pub INSURANCE_ID, "insurance_id","Health insurance identifier.", ["health", "phi", "pii"]); +label!(pub PRESCRIPTION_ID, "prescription_id","Prescription identifier or medication regimen.", ["health", "phi"]); +label!(pub DIAGNOSIS, "diagnosis","Medical diagnosis or condition.", ["health", "phi"]); +label!(pub MEDICATION, "medication","Medication name.", ["health", "phi"]); +label!(pub FINGERPRINT, "fingerprint","Fingerprint biometric data.", ["biometric", "pii"]); +label!(pub VOICEPRINT, "voiceprint","Voiceprint biometric data.", ["biometric", "pii"]); +label!(pub RETINA_SCAN, "retina_scan","Retina scan biometric data.", ["biometric", "pii"]); +label!(pub FACIAL_GEOMETRY, "facial_geometry","Facial geometry biometric data.", ["biometric", "pii"]); +label!(pub PASSWORD, "password","Password.", ["credentials", "secret"]); +label!(pub 
API_KEY, "api_key", "API key.", ["credentials", "secret"]);
label!(pub AUTH_TOKEN, "auth_token", "Authentication token (OAuth, JWT, session token).", ["credentials", "secret"]);
label!(pub PRIVATE_KEY, "private_key", "Private cryptographic key.", ["credentials", "secret"]);
label!(pub IP_ADDRESS, "ip_address", "IP address (v4 or v6).", ["network_identifier", "pii"]);
label!(pub MAC_ADDRESS, "mac_address", "MAC address.", ["network_identifier", "pii"]);
label!(pub DEVICE_ID, "device_id", "Device identifier (IMEI, UDID, etc.).", ["network_identifier", "pii"]);
label!(pub USERNAME, "username", "Username or handle.", ["network_identifier", "pii"]);
label!(pub COORDINATES, "coordinates", "GPS coordinates (latitude/longitude).", ["location", "pii"]);
label!(pub GEOLOCATION_METADATA, "geolocation_metadata", "Geolocation metadata.", ["location", "pii"]);
label!(pub FACE, "face", "Human face detected in an image or video frame.", ["visual", "pii"]);
label!(pub HANDWRITING, "handwriting", "Handwritten text.", ["visual"]);
label!(pub SIGNATURE, "signature", "Handwritten signature.", ["visual", "pii"]);
label!(pub LOGO, "logo", "Brand or organisation logo.", ["visual"]);
label!(pub BARCODE, "barcode", "Barcode or QR code.", ["visual"]);
label!(pub ORGANIZATION_NAME, "organization_name", "Organization or company name.", ["organization"]);
label!(pub DEPARTMENT_NAME, "department_name", "Department or business-unit name.", ["organization"]);
label!(pub FACILITY_NAME, "facility_name", "Physical facility or location name.", ["organization"]);
label!(pub CASE_NUMBER, "case_number", "Case, matter, or docket number.", ["organization"]);
label!(pub INTERNAL_ID, "internal_id", "Operator-defined internal identifier.", ["organization"]);
label!(pub DATE_TIME, "date_time", "Date or time value.", ["temporal"]);
label!(pub EVENT, "event", "Named event reference.", ["temporal"]);
label!(pub OCCUPATION, "occupation", "Occupation or job title.", ["organization"]);
label!(pub PRODUCT, "product", "Product name.", ["organization"]);
label!(pub QUANTITY, "quantity", "Numerical quantity.", ["quantity"]);
label!(pub UNRESOLVED, "unresolved", "Entity kind not yet identified.", ["unresolved"]);

/// Every built-in label constant, indexed for catalog construction.
///
/// [`EntityLabelCatalog::with_builtins`] walks this slice to populate the
/// built-in catalog, so a `label!` constant that is not listed here never
/// reaches that catalog.
///
/// [`EntityLabelCatalog::with_builtins`]: super::EntityLabelCatalog::with_builtins
pub(super) static BUILT_INS: &[&LazyLock<EntityLabel>] = &[
    &PERSON_NAME,
    &DATE_OF_BIRTH,
    &GOVERNMENT_ID,
    &TAX_ID,
    &DRIVERS_LICENSE,
    &PASSPORT_NUMBER,
    &NATIONAL_INSURANCE_NUMBER,
    &VEHICLE_ID,
    &LICENSE_PLATE,
    &EMAIL_ADDRESS,
    &PHONE_NUMBER,
    &ADDRESS,
    &POSTAL_CODE,
    &URL,
    &AGE,
    &GENDER,
    &ETHNICITY,
    &RELIGION,
    &NATIONALITY,
    &CITIZENSHIP,
    &LANGUAGE,
    &PAYMENT_CARD,
    &CARD_SECURITY_CODE,
    &CARD_EXPIRY,
    &BANK_ACCOUNT,
    &BANK_ROUTING,
    &IBAN,
    &SWIFT_CODE,
    &CRYPTO_ADDRESS,
    &CURRENCY,
    &AMOUNT,
    &MEDICAL_ID,
    &INSURANCE_ID,
    &PRESCRIPTION_ID,
    &DIAGNOSIS,
    &MEDICATION,
    &FINGERPRINT,
    &VOICEPRINT,
    &RETINA_SCAN,
    &FACIAL_GEOMETRY,
    &PASSWORD,
    &API_KEY,
    &AUTH_TOKEN,
    &PRIVATE_KEY,
    &IP_ADDRESS,
    &MAC_ADDRESS,
    &DEVICE_ID,
    &USERNAME,
    &COORDINATES,
    &GEOLOCATION_METADATA,
    &FACE,
    &HANDWRITING,
    &SIGNATURE,
    &LOGO,
    &BARCODE,
    &ORGANIZATION_NAME,
    &DEPARTMENT_NAME,
    &FACILITY_NAME,
    &CASE_NUMBER,
    &INTERNAL_ID,
    &DATE_TIME,
    &EVENT,
    &OCCUPATION,
    &PRODUCT,
    &QUANTITY,
    &UNRESOLVED,
];

#[cfg(test)]
mod tests {
    use std::collections::HashSet;

    use super::*;

    #[test]
    fn well_known_built_ins_have_expected_tags() {
        assert_eq!(PAYMENT_CARD.name, "payment_card");
        assert!(PAYMENT_CARD.has_tag("financial"));
        assert!(PAYMENT_CARD.has_tag("pci"));
        assert!(PAYMENT_CARD.has_tag("pii"));
        assert_eq!(PERSON_NAME.name, "person_name");
        assert!(PERSON_NAME.has_tag("personal_identity"));
    }

    #[test]
    fn built_in_names_are_unique() {
        // The catalog keys labels by name and replaces on collision, so a
        // duplicated name would silently drop one built-in.
        let mut seen = HashSet::new();
        for label in BUILT_INS {
            assert!(
                seen.insert(label.name.as_str()),
                "duplicate built-in label name: {}",
                label.name,
            );
        }
    }
}
diff --git a/crates/nvisy-core/src/entity/label/catalog.rs b/crates/nvisy-core/src/entity/label/catalog.rs new file mode 100644 index 00000000..89f02d12 --- /dev/null +++ b/crates/nvisy-core/src/entity/label/catalog.rs @@ -0,0 +1,115 @@ +//! 
[`EntityLabelCatalog`] — name-indexed lookup over a set of +//! [`EntityLabel`]s. +//! +//! Constructed at runtime configuration time. Recognizers' +//! supported labels and selectors' tag-matching path both walk a +//! `EntityLabelCatalog`. The workspace ships a built-in catalog through +//! [`EntityLabelCatalog::with_builtins`]; deployments can register their own +//! labels alongside or instead of the built-ins via +//! [`EntityLabelCatalog::with_label`] / [`EntityLabelCatalog::with_labels`]. + +use std::collections::HashMap; +use std::sync::LazyLock; + +use hipstr::HipStr; + +use super::builtins::BUILT_INS; +use super::entity_label::EntityLabel; + +/// Name-indexed catalog of [`EntityLabel`]s. +/// +/// Built from a list of labels (mixing workspace-shipped built-ins +/// with deployment-defined custom labels). Construction copies each +/// label into a [`HashMap`] keyed by `HipStr` clone of the label's +/// name; subsequent lookups are O(1). +#[derive(Debug, Clone, Default)] +pub struct EntityLabelCatalog { + by_name: HashMap, EntityLabel>, +} + +impl EntityLabelCatalog { + /// Empty catalog. Built-ins must be registered explicitly via + /// [`Self::with_label`] / [`Self::with_labels`]; use + /// [`Self::with_builtins`] for the workspace-shipped set. + pub fn new() -> Self { + Self::default() + } + + /// EntityLabelCatalog pre-populated with every workspace-shipped built-in + /// label. + pub fn with_builtins() -> Self { + let mut cat = Self::new(); + for lazy in BUILT_INS { + cat.insert(LazyLock::force(lazy).clone()); + } + cat + } + + /// Register a single label. Replaces any prior entry sharing + /// the same [`EntityLabel::name`]. + pub fn insert(&mut self, label: EntityLabel) { + self.by_name.insert(label.name.clone(), label); + } + + /// Builder-style sibling of [`Self::insert`] returning `Self`. + #[must_use] + pub fn with_label(mut self, label: EntityLabel) -> Self { + self.insert(label); + self + } + + /// Bulk-register a sequence of labels. 
+ #[must_use] + pub fn with_labels(mut self, labels: I) -> Self + where + I: IntoIterator, + { + for l in labels { + self.insert(l); + } + self + } + + /// Look up a label by name. Returns `None` for names not + /// registered in this catalog. + pub fn lookup(&self, name: &str) -> Option<&EntityLabel> { + self.by_name.get(name) + } + + /// Iterator over every registered label, in no particular order. + pub fn iter(&self) -> impl Iterator + '_ { + self.by_name.values() + } + + /// Number of labels in the catalog. + pub fn len(&self) -> usize { + self.by_name.len() + } + + /// `true` when the catalog is empty. + pub fn is_empty(&self) -> bool { + self.by_name.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn builtin_catalog_resolves_known_names() { + let cat = EntityLabelCatalog::with_builtins(); + let l = cat.lookup("payment_card").expect("built-in"); + assert!(l.has_tag("financial")); + assert!(cat.lookup("acme_internal_id").is_none()); + } + + #[test] + fn catalog_accepts_custom_labels_alongside_builtins() { + let custom = EntityLabel::new("acme_internal_id").with_tags(["custom"]); + let cat = EntityLabelCatalog::with_builtins().with_label(custom); + assert!(cat.lookup("payment_card").is_some()); + let acme = cat.lookup("acme_internal_id").expect("custom registered"); + assert!(acme.has_tag("custom")); + } +} diff --git a/crates/nvisy-core/src/entity/label/entity_label.rs b/crates/nvisy-core/src/entity/label/entity_label.rs new file mode 100644 index 00000000..2503bf4a --- /dev/null +++ b/crates/nvisy-core/src/entity/label/entity_label.rs @@ -0,0 +1,176 @@ +//! [`EntityLabel`] — open vocabulary tag for detected entities. +//! +//! Any recognizer can mint a label by name, ship it through the +//! pipeline, and have the audit reference it verbatim. The workspace +//! ships a catalog of built-in labels in [`super::builtins`]; +//! recognizers and policy authors are free to invent new ones +//! 
(`acme-internal-id`, `medical-record-no`) without touching +//! workspace code. +//! +//! ## Identity +//! +//! Labels are identified by [`name`]; two labels with the same name +//! are considered the same entity kind regardless of differences in +//! [`description`] or [`tags`]. Selectors match by name. +//! +//! ## Tags +//! +//! [`tags`] is a free-form list of short identifiers that policy +//! selectors can match against. Built-in labels carry category +//! tags (`personal_identity`, `contact_info`, `financial`, etc.) +//! plus cross-cutting tags (`pii`, `phi`, `pci`). Custom labels +//! can ship with zero tags; selectors targeting tags only match +//! labels that carry them. +//! +//! [`name`]: EntityLabel::name +//! [`description`]: EntityLabel::description +//! [`tags`]: EntityLabel::tags + +use hipstr::HipStr; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use super::EntityLabelRef; + +/// Open-vocabulary entity label: identity, optional description, +/// and zero or more tags. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct EntityLabel { + /// Canonical name of the label (e.g. `"person_name"`, + /// `"acme_internal_id"`). Selectors match by this value. + #[schemars(with = "String")] + pub name: HipStr<'static>, + /// Optional human-readable description of what the label + /// represents. Surfaced in audits and policy author tooling. + #[serde(default, skip_serializing_if = "Option::is_none")] + #[schemars(with = "Option")] + pub description: Option>, + /// Free-form tags grouping this label with related ones. + /// Built-in labels carry category tags + /// (`personal_identity`, `financial`, …) plus cross-cutting + /// tags where applicable (`pii`, `phi`, `pci`). Empty for + /// untagged custom labels. 
+ #[serde(default, skip_serializing_if = "Vec::is_empty")] + #[schemars(with = "Vec")] + pub tags: Vec>, +} + +impl EntityLabel { + /// Construct a label from a runtime name. Description and tags + /// default to empty; use [`Self::with_description`] and + /// [`Self::with_tags`] to add them. + pub fn new(name: impl Into>) -> Self { + Self { + name: name.into(), + description: None, + tags: Vec::new(), + } + } + + /// Construct a label entirely from `&'static str` literals. + /// Used by the built-in catalog in [`super::builtins`] so the + /// strings live in static storage and runtime construction is + /// just one `Vec::from` per built-in. + pub fn from_static( + name: &'static str, + description: Option<&'static str>, + tags: &'static [&'static str], + ) -> Self { + Self { + name: HipStr::from_static(name), + description: description.map(HipStr::from_static), + tags: tags.iter().copied().map(HipStr::from_static).collect(), + } + } + + /// Attach a description. + #[must_use] + pub fn with_description(mut self, description: impl Into>) -> Self { + self.description = Some(description.into()); + self + } + + /// Attach tags. Replaces any previously set tags. + #[must_use] + pub fn with_tags(mut self, tags: I) -> Self + where + I: IntoIterator, + S: Into>, + { + self.tags = tags.into_iter().map(Into::into).collect(); + self + } + + /// Returns `true` when this label carries `tag` in its tag + /// list. Comparison is byte-for-byte. + #[must_use] + pub fn has_tag(&self, tag: &str) -> bool { + self.tags.iter().any(|t| t == tag) + } + + /// Construct a name-only [`EntityLabelRef`] handle to this + /// label. Clones the underlying [`HipStr`] (a refcount bump + /// for `from_static` labels — no allocation). 
+ /// + /// [`EntityLabelRef`]: super::EntityLabelRef + #[must_use] + pub fn label_ref(&self) -> EntityLabelRef { + EntityLabelRef::from(self.name.clone()) + } +} + +impl AsRef for EntityLabel { + fn as_ref(&self) -> &str { + &self.name + } +} + +impl std::fmt::Display for EntityLabel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.name, f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_static_round_trips() { + let l = EntityLabel::from_static( + "email_address", + Some("Email address."), + &["contact_info", "pii"], + ); + assert_eq!(l.name, "email_address"); + assert_eq!(l.description.as_deref(), Some("Email address.")); + assert!(l.has_tag("contact_info")); + assert!(l.has_tag("pii")); + assert!(!l.has_tag("financial")); + } + + #[test] + fn builder_setters_chain() { + let l = EntityLabel::new("acme_internal_id") + .with_description("ACME corp internal record id") + .with_tags(["custom", "acme"]); + assert_eq!(l.name, "acme_internal_id"); + assert_eq!( + l.description.as_deref(), + Some("ACME corp internal record id"), + ); + assert!(l.has_tag("acme")); + } + + #[test] + fn equality_ignores_metadata() { + // NOTE: deliberately *not* the behaviour today — `derive(PartialEq)` + // makes equality structural. If selectors need name-only equality + // they should compare `.name` explicitly. This test documents the + // current contract so a future change is intentional. + let a = EntityLabel::new("person_name").with_tags(["pii"]); + let b = EntityLabel::new("person_name"); + assert_ne!(a, b); + } +} diff --git a/crates/nvisy-core/src/entity/label/mod.rs b/crates/nvisy-core/src/entity/label/mod.rs new file mode 100644 index 00000000..8fc9f743 --- /dev/null +++ b/crates/nvisy-core/src/entity/label/mod.rs @@ -0,0 +1,31 @@ +//! Entity label types and catalog. +//! +//! Four concerns split across this folder: +//! +//! - [`EntityLabel`] — full catalog entry (name + description + +//! tags). 
Authored once per label; consumed by selectors and +//! audit-rendering tooling that need the metadata. +//! - [`EntityLabelRef`] — name-only handle stored on every +//! detected [`Entity`]. Cheap-clone wrapper around +//! [`HipStr<'static>`]. +//! - [`EntityLabelCatalog`] — name-indexed lookup over a +//! collection of `EntityLabel`s. The workspace ships a built-in +//! catalog constructed from [`EntityLabelCatalog::with_builtins`]; +//! consumers can register custom labels alongside or instead of +//! the built-ins. +//! - [`builtins`] — every built-in `EntityLabel` constant +//! (`builtins::PERSON_NAME`, `builtins::EMAIL_ADDRESS`, …) plus +//! the internal `BUILT_INS` slice the catalog walks at +//! construction time. +//! +//! [`Entity`]: crate::entity::Entity +//! [`HipStr<'static>`]: hipstr::HipStr + +pub mod builtins; +mod catalog; +mod entity_label; +mod reference; + +pub use self::catalog::EntityLabelCatalog; +pub use self::entity_label::EntityLabel; +pub use self::reference::EntityLabelRef; diff --git a/crates/nvisy-core/src/entity/label/reference.rs b/crates/nvisy-core/src/entity/label/reference.rs new file mode 100644 index 00000000..43cfea6d --- /dev/null +++ b/crates/nvisy-core/src/entity/label/reference.rs @@ -0,0 +1,148 @@ +//! [`EntityLabelRef`] — name-only handle to an [`EntityLabel`]. +//! +//! Per-entity hot paths (`Entity::label`, audit refs, selector +//! matching) carry only the label's identifying name, not its full +//! catalog metadata. [`EntityLabelRef`] wraps a [`HipStr<'static>`] +//! so the surface is a single newtype rather than a bare string, +//! giving us a typed receiver for ergonomics like +//! `entity.label.matches("payment_card")`. +//! +//! Catalog-side metadata (description, tags) lives on +//! [`EntityLabel`] and is dereferenced through +//! [`EntityLabelCatalog::lookup`] when a consumer needs it. +//! +//! [`EntityLabel`]: super::EntityLabel +//! 
[`EntityLabelCatalog::lookup`]: super::EntityLabelCatalog::lookup + +use std::fmt; + +use hipstr::HipStr; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// Name-only handle to an [`EntityLabel`]. Cheap-clone wrapper +/// around [`HipStr<'static>`]. +/// +/// Carried on every [`Entity`] in place of the full catalog +/// metadata. Two refs are equal when their names are equal +/// byte-for-byte. +/// +/// [`EntityLabel`]: super::EntityLabel +/// [`Entity`]: crate::entity::Entity +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(transparent)] +#[schemars(with = "String")] +pub struct EntityLabelRef(HipStr<'static>); + +impl EntityLabelRef { + /// Wrap a name. + pub fn new(name: impl Into>) -> Self { + Self(name.into()) + } + + /// Wrap a `&'static str` without allocating. + #[must_use] + pub const fn from_static(name: &'static str) -> Self { + Self(HipStr::from_static(name)) + } + + /// Borrow the underlying name. + #[must_use] + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Borrow the inner [`HipStr`]. + #[must_use] + pub fn as_hipstr(&self) -> &HipStr<'static> { + &self.0 + } + + /// Consume the ref and return the inner [`HipStr`]. + #[must_use] + pub fn into_hipstr(self) -> HipStr<'static> { + self.0 + } + + /// `true` when this ref names `name` byte-for-byte. 
+ #[must_use] + pub fn matches(&self, name: &str) -> bool { + self.0 == name + } +} + +impl AsRef for EntityLabelRef { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl fmt::Display for EntityLabelRef { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +impl From> for EntityLabelRef { + fn from(value: HipStr<'static>) -> Self { + Self(value) + } +} + +impl From for HipStr<'static> { + fn from(value: EntityLabelRef) -> Self { + value.0 + } +} + +impl From<&'static str> for EntityLabelRef { + fn from(value: &'static str) -> Self { + Self::from_static(value) + } +} + +impl From for EntityLabelRef { + fn from(value: String) -> Self { + Self(HipStr::from(value)) + } +} + +impl PartialEq for EntityLabelRef { + fn eq(&self, other: &str) -> bool { + self.0 == other + } +} + +impl PartialEq<&str> for EntityLabelRef { + fn eq(&self, other: &&str) -> bool { + self.0 == *other + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_static_no_alloc() { + let r = EntityLabelRef::from_static("payment_card"); + assert_eq!(r.as_str(), "payment_card"); + assert!(r.matches("payment_card")); + assert!(!r.matches("person_name")); + } + + #[test] + fn equality_with_str() { + let r = EntityLabelRef::from_static("email_address"); + assert_eq!(r, "email_address"); + } + + #[test] + fn serde_transparent() { + let r = EntityLabelRef::from_static("ssn"); + let s = serde_json::to_string(&r).unwrap(); + assert_eq!(s, "\"ssn\""); + let back: EntityLabelRef = serde_json::from_str(&s).unwrap(); + assert_eq!(back, r); + } +} diff --git a/crates/nvisy-core/src/entity/method/provenance.rs b/crates/nvisy-core/src/entity/method/provenance.rs index f2cf834e..712f154d 100644 --- a/crates/nvisy-core/src/entity/method/provenance.rs +++ b/crates/nvisy-core/src/entity/method/provenance.rs @@ -5,10 +5,7 @@ use serde::{Deserialize, Serialize}; /// Provenance for a pattern-based detection (regex, dictionary, /// deny-list). 
Each variant carries only the fields meaningful for -/// that matcher — the old flat `PatternKind` + `Option` -/// representation allowed invalid combinations (a `Regex` row with -/// no pattern name, a `DenyList` row with a stale validator) that -/// can't be constructed in this shape. +/// that matcher. #[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-core/src/entity/mod.rs b/crates/nvisy-core/src/entity/mod.rs index 3c23f273..efef6d5f 100644 --- a/crates/nvisy-core/src/entity/mod.rs +++ b/crates/nvisy-core/src/entity/mod.rs @@ -17,8 +17,7 @@ //! [`AuditEntry`]: https://docs.rs/nvisy-engine/latest/nvisy_engine/provenance/struct.AuditEntry.html mod annotation; -mod category; -mod kind; +pub mod label; mod method; mod source; @@ -30,8 +29,7 @@ use uuid::Uuid; pub use self::annotation::{ Annotation, AnnotationKind, AnnotationStrength, LabelAnnotation, is_excluded, }; -pub use self::category::EntityCategory; -pub use self::kind::EntityKind; +pub use self::label::{EntityLabel, EntityLabelCatalog, EntityLabelRef, builtins}; pub use self::method::{ AnnotationProvenance, ModelProvenance, PatternProvenance, TrailProvenance, TrailStep, TrailStepKind, @@ -44,11 +42,8 @@ use crate::primitive::{Confidence, LanguageTag}; /// A detected sensitive data occurrence within a document. /// -/// The category for an entity is derived from its [`entity_kind`] via -/// [`EntityKind::category`]; it is not stored separately. The trail -/// of score-affecting steps lives on [`trail`]. +/// The trail of score-affecting steps lives on [`trail`]. 
/// -/// [`entity_kind`]: Self::entity_kind /// [`trail`]: Self::trail #[derive(Debug, Clone, PartialEq, Builder)] #[derive(Serialize, Deserialize, JsonSchema)] @@ -68,9 +63,11 @@ pub struct Entity { #[builder(default, setter(into = false))] #[serde(default, skip_serializing_if = "Option::is_none")] pub entity_id: Option, - /// Specific entity kind. The broad [`EntityCategory`] is derived - /// via [`Entity::category`]. - pub entity_kind: EntityKind, + /// Open-vocabulary classification of the entity. Wraps the + /// label's identifying name; full catalog metadata + /// (description, tags) is dereferenced through an + /// [`EntityLabelCatalog`]. + pub label: EntityLabelRef, /// Modality-specific location of the entity within the document. pub location: M::Location, /// Detection confidence score in the range `[0.0, 1.0]`. Equals @@ -100,12 +97,6 @@ impl Entity { EntityBuilder::default() } - /// Derived broad classification — `self.entity_kind.category()`. - #[must_use] - pub fn category(&self) -> EntityCategory { - self.entity_kind.category() - } - /// Original recognition score, before any post-recognition /// adjustments. 
Reads from the first step's `original` (or /// `adjusted` if it had none), returning `None` only if the @@ -142,7 +133,9 @@ impl Entity { pub fn test_builder(start: usize, end: usize) -> EntityBuilder { let conf = Confidence::clamped(0.9); Entity::builder() - .with_entity_kind(EntityKind::PersonName) + .with_label(EntityLabelRef::from( + self::builtins::PERSON_NAME.name.clone(), + )) .with_trail(vec![TrailStep::recognition( "pattern", conf, diff --git a/crates/nvisy-core/src/primitive/confidence/value.rs b/crates/nvisy-core/src/primitive/confidence/value.rs index 52d8c706..540fc7e4 100644 --- a/crates/nvisy-core/src/primitive/confidence/value.rs +++ b/crates/nvisy-core/src/primitive/confidence/value.rs @@ -200,5 +200,4 @@ mod tests { assert!(serde_json::from_str::("1.4").is_err()); assert!(serde_json::from_str::("-0.2").is_err()); } - } diff --git a/crates/nvisy-core/src/recognition/hint.rs b/crates/nvisy-core/src/recognition/hint.rs index 081149eb..1f9fa9c7 100644 --- a/crates/nvisy-core/src/recognition/hint.rs +++ b/crates/nvisy-core/src/recognition/hint.rs @@ -18,7 +18,7 @@ //! [`Entity`]: crate::entity::Entity //! [`RecognizerInput::hints`]: super::RecognizerInput::hints -use crate::entity::EntityKind; +use crate::entity::EntityLabelRef; use crate::modality::Modality; /// Uploader-supplied annotation region in modality-native @@ -29,20 +29,22 @@ pub struct Hint { /// confirm or relocate this hint forward the name into the /// emitted entity's recognition trail step. pub name: Option, - /// Uploader-claimed entity kind (optional). - pub entity_kind: Option, + /// Uploader-claimed label (optional). When set, recognizers + /// that confirm the hint stamp this on the emitted entity's + /// `label` field. + pub label: Option, /// Region in modality-native coordinates. pub location: M::Location, } impl Hint { - /// Construct a hint with only the location set; name and kind + /// Construct a hint with only the location set; name and label /// default to `None`. 
#[must_use] pub fn new(location: M::Location) -> Self { Self { name: None, - entity_kind: None, + label: None, location, } } @@ -54,10 +56,10 @@ impl Hint { self } - /// Attach an uploader-claimed entity kind. + /// Attach an uploader-claimed label. #[must_use] - pub fn with_entity_kind(mut self, entity_kind: EntityKind) -> Self { - self.entity_kind = Some(entity_kind); + pub fn with_label(mut self, label: impl Into) -> Self { + self.label = Some(label.into()); self } } diff --git a/crates/nvisy-core/src/recognition/label_map.rs b/crates/nvisy-core/src/recognition/label_map.rs index cb742dd5..b19fcd34 100644 --- a/crates/nvisy-core/src/recognition/label_map.rs +++ b/crates/nvisy-core/src/recognition/label_map.rs @@ -1,98 +1,114 @@ -//! [`LabelMap`]: model-label → [`EntityKind`] translation table. +//! [`LabelMap`]: backend label → canonical label-name translation. //! //! Shared translation table used by every model-driven recognizer //! (NER backends, LLM recognizers, …). Lets a recognizer consume //! raw model labels uniformly regardless of which backend produced //! them — swap backends without re-implementing translation. //! -//! The map is bidirectional in spirit (look up an [`EntityKind`] to -//! find the canonical label a backend should be asked for) but the -//! primary path is label→kind. The reverse lookup is a linear -//! scan; if a future backend needs frequent reverse lookups we'll -//! cache both directions. +//! The map is bidirectional in spirit (look up an entity label +//! name to find the canonical model label a backend should be asked +//! for) but the primary path is model-label → entity-label-name. +//! The reverse lookup is a linear scan. use std::borrow::Cow; use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use strum::IntoEnumIterator; -use crate::entity::EntityKind; +use crate::entity::{EntityLabelCatalog, EntityLabelRef}; -/// Translation table from raw model labels to canonical -/// [`EntityKind`] values. 
+/// Translation table from raw model labels to entity label names. /// -/// The default ([`LabelMap::canonical`]) maps every `EntityKind`'s -/// snake_case string form to itself, so backends that already -/// return canonical labels (the Bento `inference-gliner` today) -/// pass through unchanged. Custom backends register their own -/// model-specific labels via [`with_entry`]. +/// The default ([`LabelMap::canonical`]) maps every name in a +/// [`EntityLabelCatalog`] to itself, so backends that already return canonical +/// names pass through unchanged. Custom backends register their +/// own model-specific labels via [`with_entry`]. /// /// [`with_entry`]: Self::with_entry #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] #[serde(transparent)] pub struct LabelMap { - entries: HashMap, + entries: HashMap, } impl LabelMap { /// Empty map. Backends with no recognizable labels see every - /// span dropped — typically you want - /// [`canonical`] instead. - /// - /// [`canonical`]: Self::canonical + /// span dropped — typically you want [`Self::canonical`] / + /// [`Self::canonical_from`] instead. #[must_use] pub fn new() -> Self { Self::default() } - /// Identity map: every [`EntityKind`] mapped to its own - /// canonical snake_case label. Use when the backend already - /// returns canonical labels. + /// Identity map from every label name in the workspace + /// built-in [`EntityLabelCatalog`] to itself. Convenience wrapper around + /// [`Self::canonical_from`] for callers that don't need custom + /// labels. #[must_use] pub fn canonical() -> Self { - let entries = EntityKind::iter() - .map(|kind| (kind.to_string(), kind)) + Self::canonical_from(&EntityLabelCatalog::with_builtins()) + } + + /// Identity map over every name in the supplied catalog. + /// Backends that already return canonical names — or that + /// have been pre-registered with the catalog — pass through + /// unchanged. 
+ #[must_use] + pub fn canonical_from(catalog: &EntityLabelCatalog) -> Self { + let entries = catalog + .iter() + .map(|label| { + ( + label.name.to_string(), + EntityLabelRef::from(label.name.clone()), + ) + }) .collect(); Self { entries } } - /// Register one label→kind entry. Last write wins on duplicate - /// labels. + /// Register one model-label → entity-label-ref entry. Last + /// write wins on duplicates. #[must_use] - pub fn with_entry(mut self, label: impl Into>, kind: EntityKind) -> Self { - self.entries.insert(label.into().into_owned(), kind); + pub fn with_entry( + mut self, + model_label: impl Into>, + entity_label: impl Into, + ) -> Self { + self.entries + .insert(model_label.into().into_owned(), entity_label.into()); self } /// Register many entries. #[must_use] - pub fn with_entries(mut self, entries: I) -> Self + pub fn with_entries(mut self, entries: I) -> Self where - I: IntoIterator, - S: Into, + I: IntoIterator, + K: Into, + V: Into, { - for (label, kind) in entries { - self.entries.insert(label.into(), kind); + for (model_label, entity_label) in entries { + self.entries.insert(model_label.into(), entity_label.into()); } self } - /// Look up a raw label. `None` when not registered. + /// Look up a raw model label. `None` when not registered. #[must_use] - pub fn lookup(&self, label: &str) -> Option { - self.entries.get(label).copied() + pub fn lookup(&self, model_label: &str) -> Option<&EntityLabelRef> { + self.entries.get(model_label) } - /// Find a label string that maps to `kind`. Linear scan; - /// returns the first match. Used by zero-shot backends that - /// need to format requested-kinds as raw labels for the - /// service. + /// Find a model label that maps to the given entity label + /// name. Linear scan; returns the first match. Used by + /// zero-shot backends that need to format requested-labels as + /// raw model labels for the service. 
#[must_use] - pub fn label_for(&self, kind: EntityKind) -> Option<&str> { + pub fn model_label_for(&self, entity_label: &str) -> Option<&str> { self.entries .iter() - .find_map(|(label, k)| (*k == kind).then_some(label.as_str())) + .find_map(|(m, e)| (e.as_str() == entity_label).then_some(m.as_str())) } /// Number of registered entries. @@ -111,26 +127,30 @@ impl LabelMap { #[cfg(test)] mod tests { use super::*; + use crate::entity::builtins; #[test] fn canonical_map_resolves_known_labels() { let map = LabelMap::canonical(); - assert_eq!(map.lookup("email_address"), Some(EntityKind::EmailAddress)); - assert_eq!(map.lookup("ssn"), None); + assert_eq!( + map.lookup("email_address").map(|r| r.as_str()), + Some("email_address") + ); + assert!(map.lookup("ssn").is_none()); } #[test] fn custom_entries_override_canonical() { - let map = LabelMap::canonical().with_entry("PER", EntityKind::PersonName); - assert_eq!(map.lookup("PER"), Some(EntityKind::PersonName)); + let map = LabelMap::canonical().with_entry( + "PER", + EntityLabelRef::from(builtins::PERSON_NAME.name.clone()), + ); + assert_eq!(map.lookup("PER").map(|r| r.as_str()), Some("person_name")); } #[test] - fn label_for_round_trips() { + fn model_label_for_round_trips() { let map = LabelMap::canonical(); - assert_eq!( - map.label_for(EntityKind::EmailAddress), - Some("email_address") - ); + assert_eq!(map.model_label_for("email_address"), Some("email_address")); } } diff --git a/crates/nvisy-engine/src/core/config.rs b/crates/nvisy-engine/src/core/config.rs new file mode 100644 index 00000000..a5918188 --- /dev/null +++ b/crates/nvisy-engine/src/core/config.rs @@ -0,0 +1,184 @@ +//! Engine-wide configuration, typically deserialized from TOML. +//! +//! [`RuntimeConfig`] is the top-level configuration object containing +//! optional subsystem sections — [`EngineConfig`], +//! [`ExtractionConfig`][ec], [`DetectionConfig`][dc], +//! [`RedactionConfig`][rc]. +//! +//! 
Per-request plan nodes (`Extraction`, `Redaction`, +//! `DeduplicationParams`, `Validation`) live alongside their +//! corresponding side's config in [`crate::detection`] or +//! [`crate::redaction`]; the per-phase modules consume them at +//! dispatch time. +//! +//! # Post-load steps +//! +//! After deserializing from TOML, callers should: +//! 1. Call [`RuntimeConfig::resolve_env`] to fill empty `api_key` +//! fields from environment variables. +//! 2. Call [`RuntimeConfig::validate`] to check structural +//! constraints. +//! +//! [ec]: crate::detection::ExtractionConfig +//! [dc]: crate::detection::DetectionConfig +//! [rc]: crate::redaction::RedactionConfig + +use std::num::NonZeroUsize; +use std::time::Duration; + +use nvisy_core::Error; +use nvisy_core::Result; +use nvisy_llm::backend::http::HttpConfig; +use semver::Version; +use serde::{Deserialize, Serialize}; +use validator::Validate; + +use crate::detection::{DetectionConfig, ExtractionConfig}; +use crate::redaction::RedactionConfig; + +fn default_config_version() -> Version { + Version::new(0, 1, 0) +} + +/// Top-level pipeline configuration, typically deserialized from TOML. +/// +/// Contains optional subsystem sections. The CLI layer owns the full +/// TOML shape (including `[server]`) and passes this struct downstream +/// to the engines. +/// +/// Every section is load-once: the engines read this struct at +/// startup, build the per-section state behind `Arc`s on their inner +/// shared state, and never re-read it. Per-request override is not +/// supported — plans tune behaviour through their own per-phase +/// config nodes, not by resupplying a `RuntimeConfig`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RuntimeConfig { + /// Configuration schema version. + #[serde(default = "default_config_version")] + pub version: Version, + + /// Engine-level execution policies, networking, and resource + /// limits. 
+ pub engine: Option, + /// Extraction registry — `[extractor.ocr]`, `[extractor.stt]` + /// sub-sections. Built once at engine startup; the `Extraction` + /// phase config carries per-call flags only. + pub extraction: Option, + /// Detection registry — `[detection.pattern]`, + /// `[detection.ner]` sub-sections. Built once at engine startup; + /// the `Detection` phase config only references these by kind. + pub detection: Option, + /// Deployment-wide redaction defaults — `[redaction]` section. + /// Built once at engine startup; the per-plan `Redaction` node + /// falls back to these for any `None` fields. Per-request + /// override is not supported. + pub redaction: Option, +} + +impl Default for RuntimeConfig { + fn default() -> Self { + Self { + version: default_config_version(), + engine: None, + extraction: None, + detection: None, + redaction: None, + } + } +} + +impl RuntimeConfig { + /// Resource limits from the engine section, or defaults. + #[must_use] + pub fn effective_limits(&self) -> ResourceLimits { + self.engine.as_ref().map(|e| e.limits).unwrap_or_default() + } + + /// Concurrency limit from the engine section's resource limits, + /// if configured. + #[must_use] + pub fn effective_concurrency(&self) -> Option { + self.engine.as_ref().and_then(|e| e.limits.concurrency) + } + + /// Validate all configuration sections. + /// + /// Checks structural constraints (e.g. retry/timeout ranges) + /// using the `validator` crate. Should be called once after + /// deserialization and after any merge. + /// + /// # Errors + /// + /// Returns a validation error listing all constraint violations. + pub fn validate(&self) -> Result<(), Error> { + if let Some(ref engine) = self.engine { + engine + .validate() + .map_err(|e| Error::validation(format!("engine: {e}"), "config"))?; + } + Ok(()) + } + + /// Resolve `api_key` fields from environment variables. 
+ /// + /// Placeholder: per-extractor/per-recognizer provider configs + /// will get their own env-var resolution path in a follow-up. + pub fn resolve_env(&mut self) {} +} + +/// Engine-level configuration: networking + resource limits. +/// +/// All settings are deployment-side — set once in `Nvisy.toml` by +/// the operator. Per-request overrides apply only to fields +/// explicitly noted as overridable. +#[derive(Debug, Clone, Default, Validate, Serialize, Deserialize)] +pub struct EngineConfig { + /// Shared HTTP client configuration for all downstream API calls. + /// + /// Applies to OCR providers, LLM agents, STT services, and any + /// other external HTTP dependencies. Controls timeouts, retries, + /// and connection pooling. + pub http: Option, + + /// Run-level resource limits (concurrency cap + timeout). + /// + /// Nested under `[engine.limits]` in TOML. + #[validate(nested)] + #[serde(default)] + pub limits: ResourceLimits, +} + +/// Hard limits on pipeline resource consumption. +/// +/// Deployment-side caps applied to every +/// [`DetectionEngine::detect`][de] / [`RedactionEngine::redact`][re] +/// pass — not adjustable per-request. +/// +/// [de]: crate::detection::DetectionEngine::detect +/// [re]: crate::redaction::RedactionEngine::redact +#[derive(Debug, Clone, Copy, Default, Validate, Serialize, Deserialize)] +pub struct ResourceLimits { + /// Maximum number of documents processed in parallel via a + /// shared [`Semaphore`]. Server-wide; not overridable + /// per-request. `None` means unbounded. + /// + /// [`Semaphore`]: tokio::sync::Semaphore + #[serde(default)] + pub concurrency: Option, + + /// Hard ceiling on total pipeline run duration. + /// + /// On expiry, the run-level cancellation token fires and the + /// run is marked as timed out. `None` means no run-level + /// timeout — rely on external supervision (k8s liveness, etc.) + /// instead. 
+ /// + /// Parses from human-friendly strings via `humantime_serde`: + /// `"60s"`, `"5m"`, `"1h30m"`. + #[serde( + default, + with = "humantime_serde", + skip_serializing_if = "Option::is_none" + )] + pub run_timeout: Option, +} diff --git a/crates/nvisy-engine/src/core/context.rs b/crates/nvisy-engine/src/core/context.rs index 550295c7..3dfdbccd 100644 --- a/crates/nvisy-engine/src/core/context.rs +++ b/crates/nvisy-engine/src/core/context.rs @@ -1,19 +1,18 @@ -//! [`RunContext`]: per-run shared state every [`Phase`] reads from. +//! Per-pass execution contexts: [`DetectionContext`] and +//! [`RedactionContext`]. //! -//! Built once per run by the pipeline orchestrator from the -//! deployment-wide [`RuntimeConfig`] and the per-request -//! [`EngineInput`]; borrowed read-only by every [`PhaseContext`] in -//! the phase loop. +//! Each side carries only the engine resources it actually consumes +//! — detection needs the extractor and recognizer registries, +//! redaction needs the redaction config and per-modality +//! anonymizer registries. The fields they share — run-wide +//! [`SharedData`], cancellation, concurrency cap — are exposed +//! through the [`PhaseContext`] trait so modality-agnostic phases +//! (deduplication, validation) can borrow either context type +//! without duplication. //! -//! Lives in `core/` (not `pipeline/`) because phases consume it -//! through `ctx.run.X` — it's part of the phase contract surface, -//! not the orchestrator's private state. The orchestrator constructs -//! it; phases read from it; nothing else mutates it. -//! -//! [`Phase`]: super::Phase -//! [`PhaseContext`]: super::PhaseContext -//! [`RuntimeConfig`]: crate::pipeline::RuntimeConfig -//! [`EngineInput`]: crate::pipeline::EngineInput +//! Lives in `core/` because phases consume these types through +//! their `ctx` parameter — they're part of the phase contract +//! surface, not the orchestrator's private state. 
use std::num::NonZeroUsize; use std::sync::Arc; @@ -23,103 +22,143 @@ use nvisy_toolkit::extraction::ExtractorRegistry; use tokio_util::sync::CancellationToken; use super::SharedData; -use crate::phases::redaction::RedactionRegistries; -use crate::pipeline::RedactionConfig; - -/// Per-run execution context shared across all document tasks. -/// -/// Engines and configs are held by value (not wrapped in `Arc`) -/// because below the top-level [`EngineInner`] singleton there's -/// no sharing — `RunContext` is built per-run, lives for that run, -/// then hands each phase its own copy via the pipeline orchestrator. -/// The engines themselves internally hold `Arc`-wrapped recognizers / -/// extractors, so cloning them is a few atomic increments. -/// -/// [`EngineInner`]: crate::pipeline::Engine -pub struct RunContext { - /// Token to signal cancellation to all tasks. +use crate::redaction::phases::RedactionRegistries; +use crate::redaction::RedactionConfig; + +/// Shared surface every phase reads from regardless of which side +/// (detection or redaction) it runs on. Implemented by both +/// [`DetectionContext`] and [`RedactionContext`]; modality-agnostic +/// phases (deduplication, validation) bound on this trait so they +/// can borrow either context type. +pub trait PhaseContext { + /// Run-wide shared state (policies, registry, key provider, + /// per-request entity-label catalog). + fn shared(&self) -> &Arc; + /// `true` when this run's cancellation token has fired. + fn is_cancelled(&self) -> bool; + /// Optional document-concurrency cap from `[engine.limits]`. + fn concurrency(&self) -> Option; +} + +/// Per-pass detection context. Detection-side phases (extraction, +/// detection, deduplication) consume it; redaction-side phases +/// never see it. +pub struct DetectionContext { pub(crate) cancel: CancellationToken, - /// Shared run-wide state: run ID, actor, registry, policies. pub(crate) shared: Arc, - /// Pre-built extractor registry. 
pub(crate) extraction_engine: ExtractorRegistry, - /// Pre-built recognizer registry. Always present; when no - /// recognizers are registered the per-modality dispatch - /// short-circuits on the empty list. Shared via `Arc` so - /// per-document phases hold a cheap handle without cloning - /// the underlying recognizer lists. + /// Per-request recognizer registry — built fresh from the + /// engine-side detection-config template plus the request's + /// label catalog. pub(crate) recognizer_registry: Arc, - /// Server-wide redaction defaults. Per-plan `Redaction` fields - /// fall back to these. - pub(crate) redaction_config: RedactionConfig, - /// Per-modality custom-anonymizer registries. Populated by - /// deployment code at engine startup; empty when no custom - /// operators are registered (only built-in redaction specs from - /// policies are then resolvable). - pub(crate) redaction_registries: RedactionRegistries, - /// Optional limit on how many documents may process concurrently. pub(crate) concurrency: Option, } -/// Bundle of the four toolkit-shaped engine resources a -/// [`RunContext`] borrows from the pipeline orchestrator. Passed as -/// one argument so [`RunContext::new`] stays narrow as new engine -/// resources land. -pub(crate) struct RunEngines { +/// Engine resources the detection pipeline borrows when building a +/// [`DetectionContext`]. Bundled into one type so adding new +/// detection-side engines doesn't widen `DetectionContext::new`'s +/// signature. +pub(crate) struct DetectionEngines { pub extraction_engine: ExtractorRegistry, pub recognizer_registry: Arc, - pub redaction_config: RedactionConfig, - pub redaction_registries: RedactionRegistries, } -impl RunContext { - /// Construct a [`RunContext`] from its parts. Called once per - /// pass by a per-subsystem orchestrator. +impl DetectionContext { + /// Construct a [`DetectionContext`] from its parts. Called once + /// per pass by the detection pipeline. 
pub(crate) fn new( cancel: CancellationToken, shared: Arc, - engines: RunEngines, + engines: DetectionEngines, concurrency: Option, ) -> Self { - let RunEngines { + let DetectionEngines { extraction_engine, recognizer_registry, - redaction_config, - redaction_registries, } = engines; Self { cancel, shared, extraction_engine, recognizer_registry, - redaction_config, - redaction_registries, concurrency, } } - /// True when this run's cancellation token has fired. - pub(crate) fn is_cancelled(&self) -> bool { - self.cancel.is_cancelled() - } - /// Pre-built extraction engine borrowed by [`ExtractionPhase`]. /// - /// [`ExtractionPhase`]: crate::pipeline::ExtractionPhase + /// [`ExtractionPhase`]: crate::detection::phases::extraction::ExtractionPhase pub(crate) fn extraction_engine(&self) -> &ExtractorRegistry { &self.extraction_engine } - /// Pre-built recognizer registry borrowed by [`DetectionPhase`]. + /// Per-request recognizer registry borrowed by + /// [`DetectionPhase`]. /// - /// [`DetectionPhase`]: crate::pipeline::DetectionPhase + /// [`DetectionPhase`]: crate::detection::phases::detection::DetectionPhase pub(crate) fn recognizer_registry(&self) -> &Arc { &self.recognizer_registry } +} + +impl PhaseContext for DetectionContext { + fn shared(&self) -> &Arc { + &self.shared + } + + fn is_cancelled(&self) -> bool { + self.cancel.is_cancelled() + } + + fn concurrency(&self) -> Option { + self.concurrency + } +} + +/// Per-pass redaction context. Redaction-side phases (redaction, +/// validation) consume it; detection-side phases never see it. +pub struct RedactionContext { + pub(crate) cancel: CancellationToken, + pub(crate) shared: Arc, + pub(crate) redaction_config: RedactionConfig, + pub(crate) redaction_registries: RedactionRegistries, + pub(crate) concurrency: Option, +} + +/// Engine resources the redaction pipeline borrows when building a +/// [`RedactionContext`]. 
Bundled into one type so adding new +/// redaction-side engines doesn't widen `RedactionContext::new`'s +/// signature. +pub(crate) struct RedactionEngines { + pub redaction_config: RedactionConfig, + pub redaction_registries: RedactionRegistries, +} + +impl RedactionContext { + /// Construct a [`RedactionContext`] from its parts. Called once + /// per pass by the redaction pipeline. + pub(crate) fn new( + cancel: CancellationToken, + shared: Arc, + engines: RedactionEngines, + concurrency: Option, + ) -> Self { + let RedactionEngines { + redaction_config, + redaction_registries, + } = engines; + Self { + cancel, + shared, + redaction_config, + redaction_registries, + concurrency, + } + } /// Server-wide redaction defaults the [`RedactionPhase`] reads. /// - /// [`RedactionPhase`]: crate::pipeline::RedactionPhase + /// [`RedactionPhase`]: crate::redaction::phases::RedactionPhase pub(crate) fn redaction_config(&self) -> &RedactionConfig { &self.redaction_config } @@ -127,18 +166,22 @@ impl RunContext { /// Per-modality custom-anonymizer registries the /// [`RedactionPhase`] consults for `Custom`-arm lookups. /// - /// [`RedactionPhase`]: crate::pipeline::RedactionPhase + /// [`RedactionPhase`]: crate::redaction::phases::RedactionPhase pub(crate) fn redaction_registries(&self) -> &RedactionRegistries { &self.redaction_registries } +} - /// Run-wide shared state (policies, registry, key provider). - pub(crate) fn shared(&self) -> &Arc { +impl PhaseContext for RedactionContext { + fn shared(&self) -> &Arc { &self.shared } - /// Optional document-concurrency cap from `[engine.limits]`. 
- pub(crate) fn concurrency(&self) -> Option { + fn is_cancelled(&self) -> bool { + self.cancel.is_cancelled() + } + + fn concurrency(&self) -> Option { self.concurrency } } diff --git a/crates/nvisy-engine/src/phases/ingestion/compression/mod.rs b/crates/nvisy-engine/src/core/ingestion/compression/mod.rs similarity index 97% rename from crates/nvisy-engine/src/phases/ingestion/compression/mod.rs rename to crates/nvisy-engine/src/core/ingestion/compression/mod.rs index a20b4e52..53de4f18 100644 --- a/crates/nvisy-engine/src/phases/ingestion/compression/mod.rs +++ b/crates/nvisy-engine/src/core/ingestion/compression/mod.rs @@ -9,7 +9,7 @@ use bytes::Bytes; use nvisy_core::{Error, Result}; -use crate::phases::ingestion::CompressionAlgorithm; +use crate::core::ingestion::CompressionAlgorithm; const TARGET: &str = "nvisy_engine::op::compression"; diff --git a/crates/nvisy-engine/src/phases/ingestion/encryption/mod.rs b/crates/nvisy-engine/src/core/ingestion/encryption/mod.rs similarity index 100% rename from crates/nvisy-engine/src/phases/ingestion/encryption/mod.rs rename to crates/nvisy-engine/src/core/ingestion/encryption/mod.rs diff --git a/crates/nvisy-engine/src/phases/ingestion/encryption/provider.rs b/crates/nvisy-engine/src/core/ingestion/encryption/provider.rs similarity index 100% rename from crates/nvisy-engine/src/phases/ingestion/encryption/provider.rs rename to crates/nvisy-engine/src/core/ingestion/encryption/provider.rs diff --git a/crates/nvisy-engine/src/phases/ingestion/encryption/service.rs b/crates/nvisy-engine/src/core/ingestion/encryption/service.rs similarity index 98% rename from crates/nvisy-engine/src/phases/ingestion/encryption/service.rs rename to crates/nvisy-engine/src/core/ingestion/encryption/service.rs index bf9943f0..4d67193a 100644 --- a/crates/nvisy-engine/src/phases/ingestion/encryption/service.rs +++ b/crates/nvisy-engine/src/core/ingestion/encryption/service.rs @@ -9,7 +9,7 @@ use rand::RngExt; use super::provider::{KeyProvider, 
SharedKeyProvider}; use super::wire::{EncryptedContent, NONCE_SIZE, WireEnvelope}; -use crate::phases::ingestion::EncryptionAlgorithm; +use crate::core::ingestion::EncryptionAlgorithm; const TARGET: &str = "nvisy_engine::op::encryption"; @@ -140,7 +140,7 @@ mod tests { use nvisy_codec::content::{ContentData, ContentSource}; use super::*; - use crate::phases::ingestion::encryption::{SharedKeyProvider, StaticKeyProvider}; + use crate::core::ingestion::encryption::{SharedKeyProvider, StaticKeyProvider}; fn test_key_provider() -> SharedKeyProvider { let key = vec![0xAB; 32]; diff --git a/crates/nvisy-engine/src/phases/ingestion/encryption/wire.rs b/crates/nvisy-engine/src/core/ingestion/encryption/wire.rs similarity index 99% rename from crates/nvisy-engine/src/phases/ingestion/encryption/wire.rs rename to crates/nvisy-engine/src/core/ingestion/encryption/wire.rs index 95b56666..83dbe4b9 100644 --- a/crates/nvisy-engine/src/phases/ingestion/encryption/wire.rs +++ b/crates/nvisy-engine/src/core/ingestion/encryption/wire.rs @@ -22,7 +22,7 @@ use bytes::Bytes; use nvisy_codec::content::ContentSource; use nvisy_core::{Error, Result}; -use crate::phases::ingestion::EncryptionAlgorithm; +use crate::core::ingestion::EncryptionAlgorithm; /// Wire-format magic bytes identifying an Nvisy encrypted blob. const MAGIC: &[u8; 4] = b"NVSE"; diff --git a/crates/nvisy-engine/src/phases/ingestion/export.rs b/crates/nvisy-engine/src/core/ingestion/export.rs similarity index 95% rename from crates/nvisy-engine/src/phases/ingestion/export.rs rename to crates/nvisy-engine/src/core/ingestion/export.rs index 4c45afa0..7e2fae09 100644 --- a/crates/nvisy-engine/src/phases/ingestion/export.rs +++ b/crates/nvisy-engine/src/core/ingestion/export.rs @@ -17,7 +17,7 @@ use super::{CompressionAlgorithm, EncryptionConfig}; /// Identifies the destination content objects and specifies any encoding /// steps that must be applied before the bytes are written out. 
/// -/// [`ExportFile`]: crate::phases::ingestion::ExportFile +/// [`ExportFile`]: crate::core::ingestion::ExportFile #[derive(Debug, Clone, Default, PartialEq, Eq, Validate)] #[derive(Serialize, Deserialize, JsonSchema)] pub struct ExportFile { diff --git a/crates/nvisy-engine/src/phases/ingestion/exporter.rs b/crates/nvisy-engine/src/core/ingestion/exporter.rs similarity index 94% rename from crates/nvisy-engine/src/phases/ingestion/exporter.rs rename to crates/nvisy-engine/src/core/ingestion/exporter.rs index 00f7c9b6..6bede14b 100644 --- a/crates/nvisy-engine/src/phases/ingestion/exporter.rs +++ b/crates/nvisy-engine/src/core/ingestion/exporter.rs @@ -14,9 +14,9 @@ use uuid::Uuid; use crate::core::{AnyTree, DocumentTree, SharedData}; use crate::modality::DocumentModality; -use crate::phases::ingestion::compression::CompressionService; -use crate::phases::ingestion::encryption::CryptoService; -use crate::phases::ingestion::{CompressionAlgorithm, EncryptionConfig}; +use crate::core::ingestion::compression::CompressionService; +use crate::core::ingestion::encryption::CryptoService; +use crate::core::ingestion::{CompressionAlgorithm, EncryptionConfig}; const TARGET: &str = "nvisy_engine::op::export_file"; diff --git a/crates/nvisy-engine/src/phases/ingestion/import.rs b/crates/nvisy-engine/src/core/ingestion/import.rs similarity index 95% rename from crates/nvisy-engine/src/phases/ingestion/import.rs rename to crates/nvisy-engine/src/core/ingestion/import.rs index 362e29e7..d7b56408 100644 --- a/crates/nvisy-engine/src/phases/ingestion/import.rs +++ b/crates/nvisy-engine/src/core/ingestion/import.rs @@ -17,7 +17,7 @@ use super::{CompressionAlgorithm, EncryptionConfig}; /// Identifies the content objects to load and specifies any decoding steps /// that must be applied before the bytes are passed to extraction nodes. 
/// -/// [`ImportFile`]: crate::phases::ingestion::ImportFile +/// [`ImportFile`]: crate::core::ingestion::ImportFile #[derive(Debug, Clone, Default, PartialEq, Eq, Validate)] #[derive(Serialize, Deserialize, JsonSchema)] pub struct ImportFile { diff --git a/crates/nvisy-engine/src/phases/ingestion/importer.rs b/crates/nvisy-engine/src/core/ingestion/importer.rs similarity index 97% rename from crates/nvisy-engine/src/phases/ingestion/importer.rs rename to crates/nvisy-engine/src/core/ingestion/importer.rs index ef0fbe3e..321c3a05 100644 --- a/crates/nvisy-engine/src/phases/ingestion/importer.rs +++ b/crates/nvisy-engine/src/core/ingestion/importer.rs @@ -37,9 +37,9 @@ use crate::modality::{ AudioExtraction, AudioMetadata, DocumentModality, ImageExtraction, ImageMetadata, TabularExtraction, TabularMetadata, TextExtraction, TextMetadata, }; -use crate::phases::ingestion::compression::CompressionService; -use crate::phases::ingestion::encryption::{CryptoService, EncryptedContent}; -use crate::phases::ingestion::{CompressionAlgorithm, EncryptionAlgorithm, EncryptionConfig}; +use crate::core::ingestion::compression::CompressionService; +use crate::core::ingestion::encryption::{CryptoService, EncryptedContent}; +use crate::core::ingestion::{CompressionAlgorithm, EncryptionAlgorithm, EncryptionConfig}; const TARGET: &str = "nvisy_engine::op::import_file"; diff --git a/crates/nvisy-engine/src/phases/ingestion/mod.rs b/crates/nvisy-engine/src/core/ingestion/mod.rs similarity index 100% rename from crates/nvisy-engine/src/phases/ingestion/mod.rs rename to crates/nvisy-engine/src/core/ingestion/mod.rs diff --git a/crates/nvisy-engine/src/core/mod.rs b/crates/nvisy-engine/src/core/mod.rs index e9ba9e99..080ac4af 100644 --- a/crates/nvisy-engine/src/core/mod.rs +++ b/crates/nvisy-engine/src/core/mod.rs @@ -8,8 +8,13 @@ //! //! # Contents //! -//! - [`RunContext`] — per-run shared state (engines, policies, -//! cancellation). +//! 
- [`RuntimeConfig`] + [`EngineConfig`] + [`ResourceLimits`] — +//! engine-wide deployment configuration shared between the +//! detection and redaction engines. +//! - [`DetectionContext`] / [`RedactionContext`] — per-pass +//! execution contexts, each carrying only the engine resources +//! its side actually consumes. Both implement [`PhaseContext`] +//! for the shared surface modality-agnostic phases bound on. //! - [`SharedData`] — `Arc`-wrapped run-wide state (registry, //! codecs, policies). //! - [`PolicyStore`] — per-modality policy storage + matching. @@ -26,8 +31,10 @@ //! [`DataAt`]: nvisy_core::extraction::DataAt //! [`Healthcheck`]: nvisy_core::health::Healthcheck +mod config; mod context; mod health; +pub mod ingestion; mod policy_store; mod shared; mod target; @@ -35,8 +42,9 @@ mod tree; pub use nvisy_core::extraction::TextAt; -pub use self::context::RunContext; -pub(crate) use self::context::RunEngines; +pub use self::config::{EngineConfig, ResourceLimits, RuntimeConfig}; +pub use self::context::{DetectionContext, PhaseContext, RedactionContext}; +pub(crate) use self::context::{DetectionEngines, RedactionEngines}; pub use self::health::probe_all; pub(crate) use self::policy_store::Decision; pub use self::policy_store::PolicyStore; diff --git a/crates/nvisy-engine/src/core/policy_store.rs b/crates/nvisy-engine/src/core/policy_store.rs index 14b95157..60c251f3 100644 --- a/crates/nvisy-engine/src/core/policy_store.rs +++ b/crates/nvisy-engine/src/core/policy_store.rs @@ -1,181 +1,208 @@ -//! [`PolicyStore`]: heterogeneous container of [`Policy`] keyed by -//! modality, backed by a [`TypeMap`], plus the per-entity decision -//! resolver that walks it. +//! [`PolicyStore`]: per-request flat container of [`Arc`], +//! plus the per-entity decision resolver that walks it. //! -//! Built from a `Vec` submission via -//! [`PolicyStore::from_any_policies`], which consumes the submitted -//! policies and wraps each in an `Arc>` — no deep clones. +//! 
Built from a `Vec` submission via +//! [`PolicyStore::from_policies`], which consumes the submitted +//! policies and wraps each in an `Arc` — no deep clones. //! Detection and redaction pipelines share a single store via //! `Arc`; per-call handoff is a refcount bump. //! -//! Internally one `Vec>>` is stored per modality; -//! lookups cost a single `TypeId` hash. The only crate-public -//! operation is [`PolicyStore::resolve`]. +//! Policies are flat (no per-modality bucketing); each rule's +//! `Action::Redact(operators)` is itself modality-aware via +//! [`ModalityRedactions::operator_for`]. `resolve::` walks every +//! policy and uses the per-modality projection to pick the operator +//! for the entity's modality, falling back to the deployment-wide +//! defaults when the rule didn't cover that modality. //! //! [`SharedData`]: super::SharedData +//! [`ModalityRedactions::operator_for`]: crate::policy::redaction::ModalityRedactions::operator_for use std::sync::Arc; use hipstr::HipStr; use nvisy_codec::content::ContentDescriptor; use nvisy_core::entity::Entity; -use type_map::concurrent::TypeMap; use crate::modality::DocumentModality; -use crate::policy::{Action, AnyPolicy, Condition, Policy, PolicyRule}; +use crate::policy::redaction::{ModalityRedactions, ProjectRedaction}; +use crate::policy::{Action, Condition, Policy, PolicyRule}; -/// Heterogeneous container of policies across all modalities, -/// stored as `Arc>` so handoff between detection and -/// redaction pipelines is a refcount bump rather than a deep clone. -/// -/// # Type-safe per-modality storage -/// -/// Backed by `type_map::TypeMap`, which stores at most one value -/// per concrete type. The crate-internal `resolve::` method -/// looks the right bucket up by `TypeId`, so adding a new modality -/// is purely an `AnyPolicy::NewM(...)` arm in the crate-internal -/// constructor — no hardcoded fields or per-modality methods to -/// maintain. 
+/// Flat container of [`Arc`] in precedence order — index +/// `0` is highest precedence. Shared between detection and redaction +/// pipelines via `Arc` so per-call handoff is a +/// refcount bump. #[derive(Default)] pub struct PolicyStore { - inner: TypeMap, + policies: Vec>, } impl PolicyStore { - /// Construct a store from a `Vec` submission, taking - /// ownership of the policies (so each [`Policy`] is moved - /// straight into its [`Arc`] — no deep clone). - pub(crate) fn from_any_policies(policies: Vec) -> Self { - use crate::modality::{Audio, Image, Tabular, Text}; - - let mut store = Self::default(); - for any in policies { - match any { - AnyPolicy::Text(p) => store.push::(Arc::new(p)), - AnyPolicy::Tabular(p) => store.push::(Arc::new(p)), - AnyPolicy::Image(p) => store.push::(Arc::new(p)), - AnyPolicy::Audio(p) => store.push::