From 2660a560a2ba23db2a68d78053cc33f83ad71cef Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sat, 13 Jun 2026 17:39:25 +0200 Subject: [PATCH 1/3] refactor(core): replace closed EntityKind enum with open EntityLabel system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves #252. Switches the workspace from a fixed `EntityKind` enum to an open-vocabulary label system so deployments can mint custom entity labels without touching workspace code. Shape: - `EntityLabel` carries name + optional description + free-form tags. - `EntityLabelRef` is a `HipStr`-wrapped name-only handle stored on every detected `Entity` (cheap clone, hot-path type). - `EntityLabelCatalog` is a runtime-constructed name-indexed lookup; the workspace ships 66 built-in labels in `entity::builtins`, each reachable as a constant (`builtins::PERSON_NAME`, …) and bundled into `EntityLabelCatalog::with_builtins()`. - `EntitySelector` matches by `labels: Vec` plus `tags: Vec` (dereferenced via the catalog). Deletes `EntityCategory` entirely; category groupings (`pii`, `phi`, `pci`, `personal_identity`, `financial`, …) survive as tags on each built-in label so selectors keep the same expressive power without an enum. Updates every recognizer (pattern, NER, LLM), nvisy-fake's per-label generator dispatch, nvisy-toolkit dedup + redaction registries, all asset TOMLs (`entity_kind` → `label`), and workspace-wide tests. 
--- .../nvisy-codec/src/handler/audio/duration.rs | 18 +- .../src/handler/audio/mp3_codec.rs | 43 +- .../src/handler/audio/mp3_handler.rs | 8 +- .../src/handler/audio/mp3_loader.rs | 7 +- crates/nvisy-core/src/context/enhancer.rs | 7 +- crates/nvisy-core/src/entity/annotation.rs | 37 +- crates/nvisy-core/src/entity/category.rs | 72 --- crates/nvisy-core/src/entity/kind.rs | 508 ------------------ .../nvisy-core/src/entity/label/builtins.rs | 176 ++++++ .../src/entity/label/entity_label.rs | 174 ++++++ .../src/entity/label/entity_label_catalog.rs | 115 ++++ .../src/entity/label/entity_label_ref.rs | 145 +++++ crates/nvisy-core/src/entity/label/mod.rs | 28 + .../src/entity/method/provenance.rs | 5 +- crates/nvisy-core/src/entity/mod.rs | 29 +- .../src/primitive/confidence/value.rs | 1 - crates/nvisy-core/src/recognition/hint.rs | 18 +- .../nvisy-core/src/recognition/label_map.rs | 124 +++-- crates/nvisy-engine/src/phases/detection.rs | 8 +- .../nvisy-engine/src/phases/redaction/mod.rs | 2 +- .../src/pipeline/config/detection/mod.rs | 24 +- .../src/pipeline/config/detection/plan.rs | 8 +- .../src/pipeline/redaction/applicator.rs | 55 +- .../src/pipeline/redaction/override_.rs | 6 +- .../src/pipeline/redaction/pipeline.rs | 1 - crates/nvisy-engine/src/policy/mod.rs | 8 +- crates/nvisy-engine/src/policy/selector.rs | 56 +- crates/nvisy-fake/src/anonymizer/mod.rs | 56 +- crates/nvisy-fake/src/generator/mod.rs | 210 ++++---- .../nvisy-llm/src/recognition/candidates.rs | 9 +- .../nvisy-llm/src/recognition/file_prompt.rs | 27 +- crates/nvisy-llm/src/recognition/lift.rs | 54 +- .../nvisy-llm/src/recognition/text_prompt.rs | 5 +- .../nvisy-llm/src/recognition/vlm_prompt.rs | 5 +- crates/nvisy-llm/tests/file_prompt.rs | 36 +- crates/nvisy-ner/src/backend/bento_backend.rs | 37 +- crates/nvisy-ner/src/backend/bento_types.rs | 19 +- crates/nvisy-ner/src/backend/mod.rs | 13 +- crates/nvisy-ner/src/backend/ner_backend.rs | 22 +- crates/nvisy-ner/src/backend/ner_span.rs | 5 +- 
crates/nvisy-ner/src/recognition/config.rs | 18 +- .../nvisy-ner/src/recognition/recognizer.rs | 47 +- .../finance/cryptocurrencies.toml | 2 +- .../dictionaries/finance/currencies.toml | 2 +- .../dictionaries/general/languages.toml | 2 +- .../dictionaries/general/nationalities.toml | 2 +- .../dictionaries/general/religions.toml | 2 +- .../assets/patterns/contact/email.toml | 2 +- .../assets/patterns/contact/phone.toml | 2 +- .../assets/patterns/contact/url.toml | 2 +- .../assets/patterns/credentials/aws_key.toml | 2 +- .../patterns/credentials/generic_api_key.toml | 2 +- .../patterns/credentials/github_token.toml | 2 +- .../patterns/credentials/private_key.toml | 2 +- .../patterns/credentials/stripe_key.toml | 2 +- .../patterns/finance/bitcoin_address.toml | 2 +- .../assets/patterns/finance/credit_card.toml | 2 +- .../patterns/finance/ethereum_address.toml | 2 +- .../assets/patterns/finance/iban.toml | 2 +- .../assets/patterns/finance/swift_code.toml | 2 +- .../patterns/finance/us_bank_routing.toml | 2 +- .../assets/patterns/identity/ssn.toml | 2 +- .../patterns/identity/us_drivers_license.toml | 2 +- .../assets/patterns/identity/us_passport.toml | 2 +- .../patterns/identity/us_postal_code.toml | 2 +- .../assets/patterns/network/ipv4.toml | 2 +- .../assets/patterns/network/ipv6.toml | 2 +- .../assets/patterns/network/mac_address.toml | 2 +- .../patterns/personal/date_of_birth.toml | 2 +- .../assets/patterns/personal/datetime.toml | 2 +- .../src/recognition/dictionary.rs | 12 +- .../src/recognition/recognizer.rs | 18 +- .../src/recognition/regex_rule.rs | 8 +- .../testdata/dictionaries/product_codes.toml | 2 +- .../testdata/patterns/employee_id.toml | 2 +- .../testdata/patterns/product_codes.toml | 2 +- .../nvisy-pattern/tests/enhancer_roundtrip.rs | 4 +- .../nvisy-pattern/tests/shipped_detection.rs | 85 ++- crates/nvisy-pattern/tests/user_rules.rs | 6 +- crates/nvisy-toolkit/examples/pipeline.rs | 16 +- .../src/deduplication/filter/mod.rs | 61 +-- 
.../src/deduplication/fuse/group.rs | 10 +- .../src/deduplication/fuse/key.rs | 6 +- .../nvisy-toolkit/src/deduplication/params.rs | 8 +- .../src/deduplication/pipeline.rs | 2 +- .../src/deduplication/resolve/mod.rs | 24 +- crates/nvisy-toolkit/src/detection/chunks.rs | 2 +- .../src/redaction/anonymizer/encrypt.rs | 4 +- .../src/redaction/anonymizer/hash.rs | 4 +- .../src/redaction/anonymizer/keep.rs | 2 +- .../src/redaction/anonymizer/mask.rs | 4 +- .../src/redaction/anonymizer/mod.rs | 4 +- .../src/redaction/anonymizer/replace.rs | 40 +- .../src/redaction/deanonymizer/decrypt.rs | 6 +- .../nvisy-toolkit/src/redaction/registry.rs | 127 +++-- crates/nvisy-toolkit/tests/codec_e2e_csv.rs | 66 ++- crates/nvisy-toolkit/tests/codec_e2e_html.rs | 18 +- crates/nvisy-toolkit/tests/codec_e2e_json.rs | 29 +- crates/nvisy-toolkit/tests/codec_e2e_txt.rs | 17 +- .../nvisy-toolkit/tests/fixtures/asserts.rs | 27 +- .../tests/fixtures/registries.rs | 25 +- .../tests/recognition_registry.rs | 9 +- 102 files changed, 1577 insertions(+), 1375 deletions(-) delete mode 100644 crates/nvisy-core/src/entity/category.rs delete mode 100644 crates/nvisy-core/src/entity/kind.rs create mode 100644 crates/nvisy-core/src/entity/label/builtins.rs create mode 100644 crates/nvisy-core/src/entity/label/entity_label.rs create mode 100644 crates/nvisy-core/src/entity/label/entity_label_catalog.rs create mode 100644 crates/nvisy-core/src/entity/label/entity_label_ref.rs create mode 100644 crates/nvisy-core/src/entity/label/mod.rs diff --git a/crates/nvisy-codec/src/handler/audio/duration.rs b/crates/nvisy-codec/src/handler/audio/duration.rs index a09e62ec..421ccfa5 100644 --- a/crates/nvisy-codec/src/handler/audio/duration.rs +++ b/crates/nvisy-codec/src/handler/audio/duration.rs @@ -44,16 +44,18 @@ const TARGET: &str = "nvisy_codec::handler::audio::duration"; /// first track lacks a timebase or a known duration, or when the /// computed duration would overflow `i64` microseconds. 
pub(super) fn probe_duration_us(bytes: &Bytes, extension_hint: &str) -> Result { - let mss = MediaSourceStream::new( - Box::new(Cursor::new(bytes.clone())), - Default::default(), - ); + let mss = MediaSourceStream::new(Box::new(Cursor::new(bytes.clone())), Default::default()); let mut hint = Hint::new(); hint.with_extension(extension_hint); let reader = get_probe() - .probe(&hint, mss, FormatOptions::default(), MetadataOptions::default()) + .probe( + &hint, + mss, + FormatOptions::default(), + MetadataOptions::default(), + ) .map_err(|e| Error::validation(format!("audio probe failed: {e}"), TARGET))?; let track = reader @@ -61,9 +63,9 @@ pub(super) fn probe_duration_us(bytes: &Bytes, extension_hint: &str) -> Result Result { - let mss = MediaSourceStream::new( - Box::new(Cursor::new(bytes.clone())), - Default::default(), - ); + let mss = MediaSourceStream::new(Box::new(Cursor::new(bytes.clone())), Default::default()); let mut hint = Hint::new(); hint.with_extension("mp3"); let reader = get_probe() - .probe(&hint, mss, FormatOptions::default(), MetadataOptions::default()) + .probe( + &hint, + mss, + FormatOptions::default(), + MetadataOptions::default(), + ) .map_err(|e| Error::validation(format!("MP3 probe failed: {e}"), TARGET))?; let track = reader @@ -87,15 +89,17 @@ pub(super) fn probe_channels(bytes: &Bytes) -> Result { /// [`super::redact::apply`] helper and for handing back to /// [`encode_from_pcm`]. 
pub(super) fn decode_to_pcm(bytes: &Bytes) -> Result { - let mss = MediaSourceStream::new( - Box::new(Cursor::new(bytes.clone())), - Default::default(), - ); + let mss = MediaSourceStream::new(Box::new(Cursor::new(bytes.clone())), Default::default()); let mut hint = Hint::new(); hint.with_extension("mp3"); let mut reader = get_probe() - .probe(&hint, mss, FormatOptions::default(), MetadataOptions::default()) + .probe( + &hint, + mss, + FormatOptions::default(), + MetadataOptions::default(), + ) .map_err(|e| Error::validation(format!("MP3 probe failed: {e}"), TARGET))?; let track = reader @@ -110,9 +114,9 @@ pub(super) fn decode_to_pcm(bytes: &Bytes) -> Result { .ok_or_else(|| Error::validation("MP3 track is missing audio codec params", TARGET))? .clone(); - let sample_rate = audio_params.sample_rate.ok_or_else(|| { - Error::validation("MP3 track is missing a sample rate", TARGET) - })?; + let sample_rate = audio_params + .sample_rate + .ok_or_else(|| Error::validation("MP3 track is missing a sample rate", TARGET))?; let channels = audio_params .channels .as_ref() @@ -163,10 +167,7 @@ pub(super) fn decode_to_pcm(bytes: &Bytes) -> Result { continue; } Err(e) => { - return Err(Error::validation( - format!("MP3 decode failed: {e}"), - TARGET, - )); + return Err(Error::validation(format!("MP3 decode failed: {e}"), TARGET)); } } } @@ -250,8 +251,8 @@ pub(super) fn encode_from_pcm( ) -> Result, Error> { let bitrate = snap_bitrate(target_bitrate_bps); - let mut encoder = Builder::new() - .ok_or_else(|| Error::validation("LAME builder failed", TARGET))?; + let mut encoder = + Builder::new().ok_or_else(|| Error::validation("LAME builder failed", TARGET))?; encoder .set_sample_rate(sample_rate) .map_err(|e| Error::validation(format!("LAME sample-rate rejected: {e:?}"), TARGET))?; diff --git a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs index b37d6a7a..e3d376a6 100644 --- 
a/crates/nvisy-codec/src/handler/audio/mp3_handler.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_handler.rs @@ -24,10 +24,9 @@ use nvisy_core::modality::{Audio, AudioData, AudioLocation}; use nvisy_core::primitive::TimeSpan; use nvisy_core::redaction::Redactions; -use super::Mp3Loader; use super::duration::probe_duration_us; use super::mp3_codec::{decode_to_pcm, encode_from_pcm}; -use super::redact; +use super::{Mp3Loader, redact}; use crate::content::{ContentData, ContentSource}; use crate::{Chunk, Format, FormatId, Handler}; @@ -229,7 +228,10 @@ mod tests { // boundary where smear is biggest. let start = sr * 3 / 4; let end = sr * 5 / 4; - let mean_abs: f32 = decoded.samples[start..end].iter().map(|s| s.abs()).sum::() + let mean_abs: f32 = decoded.samples[start..end] + .iter() + .map(|s| s.abs()) + .sum::() / (end - start) as f32; assert!( mean_abs < 0.05, diff --git a/crates/nvisy-codec/src/handler/audio/mp3_loader.rs b/crates/nvisy-codec/src/handler/audio/mp3_loader.rs index 6106b058..1b6010ec 100644 --- a/crates/nvisy-codec/src/handler/audio/mp3_loader.rs +++ b/crates/nvisy-codec/src/handler/audio/mp3_loader.rs @@ -68,20 +68,19 @@ mod tests { #[tokio::test] async fn accepts_stereo_mp3() { - let loader = Mp3Loader; let bytes = fixture_stereo_mp3(); let content = ContentData::new(ContentSource::new(), bytes); - loader.decode(content).await.expect("stereo MP3 should load"); + let handler = Mp3Loader.decode(content).await; + handler.expect("stereo MP3 should load"); } #[tokio::test] async fn rejects_garbage_bytes() { - let loader = Mp3Loader; let content = ContentData::new( ContentSource::new(), Bytes::from_static(b"definitely not an mp3"), ); - let err = loader.decode(content).await.unwrap_err(); + let err = Mp3Loader.decode(content).await.unwrap_err(); assert!(err.to_string().contains("MP3 probe failed")); } } diff --git a/crates/nvisy-core/src/context/enhancer.rs b/crates/nvisy-core/src/context/enhancer.rs index 1c0501ba..38ff3794 100644 --- 
a/crates/nvisy-core/src/context/enhancer.rs +++ b/crates/nvisy-core/src/context/enhancer.rs @@ -221,7 +221,8 @@ mod tests { use super::*; use crate::context::Context; use crate::entity::{ - EntityKind, ModelProvenance, PatternProvenance, TrailProvenance, TrailStepKind, + EntityLabelRef, ModelProvenance, PatternProvenance, TrailProvenance, TrailStepKind, + builtins, }; use crate::modality::{Text, TextLocation}; @@ -240,7 +241,7 @@ mod tests { format!("pattern `{name}` matched"), ); Entity::builder() - .with_entity_kind(EntityKind::GovernmentId) + .with_label(EntityLabelRef::from(builtins::GOVERNMENT_ID.name.clone())) .with_trail(vec![step]) .with_confidence(confidence) .with_location(TextLocation::new(span.start, span.end)) @@ -258,7 +259,7 @@ mod tests { format!("model `{name}` matched"), ); Entity::builder() - .with_entity_kind(EntityKind::PersonName) + .with_label(EntityLabelRef::from(builtins::PERSON_NAME.name.clone())) .with_trail(vec![step]) .with_confidence(confidence) .with_location(TextLocation::new(span.start, span.end)) diff --git a/crates/nvisy-core/src/entity/annotation.rs b/crates/nvisy-core/src/entity/annotation.rs index b399ced6..5f9b419b 100644 --- a/crates/nvisy-core/src/entity/annotation.rs +++ b/crates/nvisy-core/src/entity/annotation.rs @@ -25,7 +25,7 @@ use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use super::{AnnotationProvenance, Entity, EntityKind, TrailProvenance, TrailStep}; +use super::{AnnotationProvenance, Entity, EntityLabelRef, TrailProvenance, TrailStep, builtins}; use crate::modality::{Modality, Overlap}; use crate::primitive::Confidence; @@ -82,16 +82,14 @@ pub enum AnnotationStrength { pub enum AnnotationKind { /// Pre-identified region the user wants treated as sensitive. Inclusion { - /// Specific entity kind. `None` when the user wants the - /// region treated as sensitive without committing to a - /// kind — synthesised entities fall back to - /// [`EntityKind::Unresolved`]. 
The broad - /// [`EntityCategory`] is derived via - /// [`EntityKind::category`]. + /// Label to attach to the synthesised entity. `None` + /// when the user wants the region treated as sensitive + /// without committing to a kind — synthesised entities + /// then fall back to [`builtins::UNRESOLVED`]. /// - /// [`EntityCategory`]: super::EntityCategory - #[serde(skip_serializing_if = "Option::is_none")] - entity_kind: Option, + /// [`builtins::UNRESOLVED`]: super::builtins::UNRESOLVED + #[serde(default, skip_serializing_if = "Option::is_none")] + label: Option, /// Modality-specific location this inclusion targets. target: M::Location, /// Whether this is an advisory [`Hint`] (LLM may reject) or @@ -137,7 +135,7 @@ impl Annotation { /// [`Hint`]: AnnotationStrength::Hint pub fn to_inclusion_entity(&self) -> Option> { let AnnotationKind::Inclusion { - entity_kind, + label, target, strength: AnnotationStrength::Assert, } = &self.kind @@ -145,8 +143,12 @@ impl Annotation { return None; }; + let label_ref = label + .clone() + .unwrap_or_else(|| EntityLabelRef::from(builtins::UNRESOLVED.name.clone())); + let entity = Entity::builder() - .with_entity_kind(entity_kind.unwrap_or(EntityKind::Unresolved)) + .with_label(label_ref) .with_trail(vec![TrailStep::recognition( "annotation", Confidence::MAX, @@ -212,7 +214,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::entity::EntityCategory; + use crate::entity::builtins; use crate::modality::{Image, ImageLocation, Text, TextLocation}; use crate::primitive::BoundingBox; @@ -220,7 +222,7 @@ mod tests { Annotation { name: None, kind: AnnotationKind::Inclusion { - entity_kind: Some(EntityKind::PersonName), + label: Some(EntityLabelRef::from(builtins::PERSON_NAME.name.clone())), target: TextLocation::new(start, end), strength, }, @@ -263,14 +265,13 @@ mod tests { let ann: Annotation = Annotation { name: None, kind: AnnotationKind::Inclusion { - entity_kind: None, + label: None, target: TextLocation::new(0, 10), 
strength: AnnotationStrength::Assert, }, }; let entity = ann.to_inclusion_entity().unwrap(); - assert_eq!(entity.category(), EntityCategory::Unresolved); - assert_eq!(entity.entity_kind, EntityKind::Unresolved); + assert_eq!(entity.label.as_str(), builtins::UNRESOLVED.name.as_str()); } #[test] @@ -285,7 +286,7 @@ mod tests { let ann: Annotation = Annotation { name: Some("face".into()), kind: AnnotationKind::Inclusion { - entity_kind: Some(EntityKind::PersonName), + label: Some(EntityLabelRef::from(builtins::FACE.name.clone())), target: ImageLocation::new(bbox), strength: AnnotationStrength::Assert, }, diff --git a/crates/nvisy-core/src/entity/category.rs b/crates/nvisy-core/src/entity/category.rs deleted file mode 100644 index 652c0ef5..00000000 --- a/crates/nvisy-core/src/entity/category.rs +++ /dev/null @@ -1,72 +0,0 @@ -//! Broad entity category classification. -//! -//! [`EntityCategory`] groups related [`EntityKind`] -//! variants into policy-addressable buckets. Policy selectors can -//! target an entire category (e.g. "redact all financial data") without -//! enumerating individual kinds. -//! -//! [`EntityKind`]: super::EntityKind - -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use strum::{Display, EnumString}; - -/// Broad category of sensitive data. -/// -/// Each [`EntityKind`] maps to exactly one category -/// via [`EntityKind::category()`]. -/// -/// [`EntityKind`]: super::EntityKind -/// [`EntityKind::category()`]: super::EntityKind::category -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)] -#[derive(Display, EnumString, Serialize, Deserialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -#[non_exhaustive] -pub enum EntityCategory { - /// Personal identity: names, government IDs, dates of birth, and - /// other attributes that directly identify a natural person. 
- PersonalIdentity, - /// Contact information: email addresses, phone numbers, physical - /// addresses, postal codes, and URLs. - ContactInfo, - /// Demographic attributes: age, gender, ethnicity, religion, - /// nationality, and citizenship. - Demographic, - /// Financial instruments and accounts: payment cards, bank - /// accounts, routing numbers, IBAN, crypto addresses, and - /// monetary amounts. - Financial, - /// Protected health information: medical record numbers, - /// insurance IDs, prescriptions, diagnoses, and medications. - Health, - /// Biometric identifiers: fingerprints, voiceprints, retina - /// scans, and facial geometry templates. - Biometric, - /// Secrets and credentials: passwords, API keys, authentication - /// tokens, and private cryptographic keys. - Credentials, - /// Network and device identifiers: IP addresses, MAC addresses, - /// device IDs, and usernames. - NetworkIdentifier, - /// Geographic and spatial data: GPS coordinates and geolocation - /// metadata. - Location, - /// Sensitive visual elements detected in images or video: - /// faces, handwriting, signatures, logos, and barcodes. - Visual, - /// Organizational identifiers: company names, departments, - /// facilities, and institutional reference numbers. - Organizational, - /// General-purpose entities surfaced by zero-shot models that - /// are not strictly PII but are routinely useful for policy - /// routing or document structuring: events, occupations, - /// products, quantities. - GeneralPurpose, - /// Fallback bucket for entities a recognizer flagged as sensitive - /// but could not place into a more specific category. Use sparingly - /// — every recognizer should prefer a precise category when one - /// exists. - #[default] - Unresolved, -} diff --git a/crates/nvisy-core/src/entity/kind.rs b/crates/nvisy-core/src/entity/kind.rs deleted file mode 100644 index c435de7c..00000000 --- a/crates/nvisy-core/src/entity/kind.rs +++ /dev/null @@ -1,508 +0,0 @@ -//! 
Concrete entity kind enumeration. -//! -//! [`EntityKind`] enumerates the types of sensitive data the platform -//! can detect or redact. Each variant maps to a stable `snake_case` -//! string for serialization and display, and to an [`EntityCategory`] -//! via [`EntityKind::category`]. - -use schemars::JsonSchema; -use serde::{Deserialize, Serialize}; -use strum::{Display, EnumIter, EnumString, IntoEnumIterator}; - -use super::category::EntityCategory; - -/// Specific kind of sensitive entity detected or targeted for redaction. -#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Display)] -#[derive(EnumIter, EnumString, Serialize, Deserialize, JsonSchema)] -#[serde(rename_all = "snake_case")] -#[strum(serialize_all = "snake_case")] -#[non_exhaustive] -pub enum EntityKind { - // Personal identity - /// Person name (full, first, or last). - PersonName, - /// Date of birth. - DateOfBirth, - /// Government-issued identification number (SSN, SIN, Aadhaar, national ID, etc.). - GovernmentId, - /// Tax identification number (ITIN, EIN, TIN, etc.). - TaxId, - /// Driver's license number. - DriversLicense, - /// Passport number. - PassportNumber, - /// National insurance or social-security equivalent (NI, BSN, AHVN, etc.). - NationalInsuranceNumber, - /// Vehicle identification number (VIN). - VehicleId, - /// License plate number. - LicensePlate, - - // Contact information - /// Email address. - EmailAddress, - /// Phone number. - PhoneNumber, - /// Physical or mailing address. - Address, - /// Postal or ZIP code. - PostalCode, - /// URL or hyperlink. - Url, - - // Demographic - /// Age value. - Age, - /// Gender identity. - Gender, - /// Racial or ethnic background. - Ethnicity, - /// Religious affiliation. - Religion, - /// Nationality. - Nationality, - /// Citizenship status. - Citizenship, - /// Language or dialect spoken. - Language, - - // Financial - /// Payment card number (credit or debit). - PaymentCard, - /// Payment card security code (CVV/CVC). 
- CardSecurityCode, - /// Payment card expiration date. - CardExpiry, - /// Bank account number. - BankAccount, - /// Bank routing or transit number. - BankRouting, - /// International Bank Account Number (IBAN). - Iban, - /// SWIFT / BIC code. - SwiftCode, - /// Cryptocurrency wallet address. - CryptoAddress, - /// Currency name or ISO 4217 code (USD, US Dollar, EUR, BTC, - /// Bitcoin, …). Distinct from a concrete [`Amount`]. - /// - /// [`Amount`]: Self::Amount - Currency, - /// Monetary amount. - Amount, - - // Health - /// Medical or patient identifier. - MedicalId, - /// Insurance policy number. - InsuranceId, - /// Prescription number. - PrescriptionId, - /// Medical diagnosis or condition. - Diagnosis, - /// Drug or medication name in a patient context. - Medication, - - // Biometric - /// Fingerprint template or minutiae data. - Fingerprint, - /// Voiceprint or speaker embedding. - Voiceprint, - /// Retina or iris scan data. - RetinaScan, - /// Facial geometry or face embedding (not a photo: see [`Face`]). - /// - /// [`Face`]: Self::Face - FacialGeometry, - - // Credentials - /// Password or passphrase. - Password, - /// API key. - ApiKey, - /// Authentication or session token. - AuthToken, - /// Private cryptographic key. - PrivateKey, - - // Network and device identifiers - /// IP address (v4 or v6). - IpAddress, - /// MAC (hardware) address. - MacAddress, - /// Device identifier (IMEI, IDFA, etc.). - DeviceId, - /// Username or online handle. - Username, - - // Location - /// GPS coordinates (latitude / longitude). - Coordinates, - /// Geolocation metadata (EXIF, cell tower, etc.). - GeolocationMetadata, - - // Visual - /// Detected human face in an image. - Face, - /// Handwritten text region. - Handwriting, - /// Handwritten or digital signature. - Signature, - /// Logo or brand mark. - Logo, - /// Barcode (1D) or QR code (2D). - Barcode, - - // Organizational - /// Company or institution name. 
- OrganizationName, - /// Internal division or department name. - DepartmentName, - /// Physical facility name (hospital, office, school). - FacilityName, - /// Legal or administrative case identifier. - CaseNumber, - /// Internal reference number (invoice, contract, PO, employee number, membership ID). - InternalId, - - // Temporal - /// Date, time, or datetime value. - DateTime, - - // General-purpose NER labels (commonly emitted by zero-shot - // models like GLiNER): not strictly PII but useful to flag for - // policy routing, redaction overrides, or downstream - // structuring. - /// Event reference (conferences, weddings, public happenings). - Event, - /// Occupation, role, or job title. - Occupation, - /// Product, service, or model name. - Product, - /// Numeric quantity or measurement (distinct from monetary - /// [`Amount`]). - /// - /// [`Amount`]: Self::Amount - Quantity, - - /// Fallback kind for entities a recognizer flagged as sensitive - /// but could not classify into a more specific kind. Pairs with - /// [`EntityCategory::Unresolved`]. - #[default] - Unresolved, -} - -impl EntityKind { - /// Every defined [`EntityKind`] variant, in declaration order. - /// - /// Use with combinators to build category-filtered allowlists - /// without enumerating variants by hand: - /// - /// ```ignore - /// let text_kinds: Vec = EntityKind::all() - /// .filter(|k| !k.is_biometric() && !k.is_visual()) - /// .collect(); - /// ``` - pub fn all() -> impl Iterator { - ::iter() - } - - /// Returns the [`EntityCategory`] this entity kind belongs to. 
- pub fn category(&self) -> EntityCategory { - match self { - // Personal identity - Self::PersonName - | Self::DateOfBirth - | Self::GovernmentId - | Self::TaxId - | Self::DriversLicense - | Self::PassportNumber - | Self::NationalInsuranceNumber - | Self::VehicleId - | Self::LicensePlate => EntityCategory::PersonalIdentity, - - // Contact - Self::EmailAddress - | Self::PhoneNumber - | Self::Address - | Self::PostalCode - | Self::Url => EntityCategory::ContactInfo, - - // Demographic - Self::Age - | Self::Gender - | Self::Ethnicity - | Self::Religion - | Self::Nationality - | Self::Citizenship - | Self::Language => EntityCategory::Demographic, - - // Financial - Self::PaymentCard - | Self::CardSecurityCode - | Self::CardExpiry - | Self::BankAccount - | Self::BankRouting - | Self::Iban - | Self::SwiftCode - | Self::CryptoAddress - | Self::Currency - | Self::Amount => EntityCategory::Financial, - - // Health - Self::MedicalId - | Self::InsuranceId - | Self::PrescriptionId - | Self::Diagnosis - | Self::Medication => EntityCategory::Health, - - // Biometric - Self::Fingerprint | Self::Voiceprint | Self::RetinaScan | Self::FacialGeometry => { - EntityCategory::Biometric - } - - // Credentials - Self::Password | Self::ApiKey | Self::AuthToken | Self::PrivateKey => { - EntityCategory::Credentials - } - - // Network - Self::IpAddress | Self::MacAddress | Self::DeviceId | Self::Username => { - EntityCategory::NetworkIdentifier - } - - // Location - Self::Coordinates | Self::GeolocationMetadata => EntityCategory::Location, - - // Visual - Self::Face | Self::Handwriting | Self::Signature | Self::Logo | Self::Barcode => { - EntityCategory::Visual - } - - // Organizational - Self::OrganizationName - | Self::DepartmentName - | Self::FacilityName - | Self::CaseNumber - | Self::InternalId => EntityCategory::Organizational, - - // Temporal (grouped under PersonalIdentity: bare dates most - // commonly appear alongside personal data and are regulated - // as PII by GDPR/CCPA) - 
Self::DateTime => EntityCategory::PersonalIdentity, - - // General-purpose - Self::Event | Self::Occupation | Self::Product | Self::Quantity => { - EntityCategory::GeneralPurpose - } - - Self::Unresolved => EntityCategory::Unresolved, - } - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::PersonalIdentity`]. - #[must_use] - pub fn is_personal_identity(&self) -> bool { - self.category() == EntityCategory::PersonalIdentity - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::ContactInfo`]. - #[must_use] - pub fn is_contact_info(&self) -> bool { - self.category() == EntityCategory::ContactInfo - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Demographic`]. - #[must_use] - pub fn is_demographic(&self) -> bool { - self.category() == EntityCategory::Demographic - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Financial`]. - #[must_use] - pub fn is_financial(&self) -> bool { - self.category() == EntityCategory::Financial - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Health`]. - #[must_use] - pub fn is_health(&self) -> bool { - self.category() == EntityCategory::Health - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Biometric`]. - #[must_use] - pub fn is_biometric(&self) -> bool { - self.category() == EntityCategory::Biometric - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Credentials`]. - #[must_use] - pub fn is_credentials(&self) -> bool { - self.category() == EntityCategory::Credentials - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::NetworkIdentifier`]. - #[must_use] - pub fn is_network_identifier(&self) -> bool { - self.category() == EntityCategory::NetworkIdentifier - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Location`]. 
- #[must_use] - pub fn is_location(&self) -> bool { - self.category() == EntityCategory::Location - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Visual`]. - #[must_use] - pub fn is_visual(&self) -> bool { - self.category() == EntityCategory::Visual - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::Organizational`]. - #[must_use] - pub fn is_organizational(&self) -> bool { - self.category() == EntityCategory::Organizational - } - - /// Convenience predicate: this kind belongs to - /// [`EntityCategory::GeneralPurpose`]. - #[must_use] - pub fn is_general_purpose(&self) -> bool { - self.category() == EntityCategory::GeneralPurpose - } - - /// Convenience predicate: this kind has a recognisable - /// structural shape — fixed character-class layout, separators - /// at meaningful positions, fixed length — that anonymizers - /// and validators may want to preserve verbatim. - /// - /// True for: IBAN, payment cards (number / CVV / expiry), - /// bank accounts and routing, SWIFT/BIC, postal codes, phone - /// numbers, email addresses, dates and date-times, IP and MAC - /// addresses, license plates, coordinates. False for free-form - /// names, addresses, occupations, tokens, etc. 
- #[must_use] - pub fn is_structured(&self) -> bool { - matches!( - self, - Self::PaymentCard - | Self::CardSecurityCode - | Self::CardExpiry - | Self::Iban - | Self::BankAccount - | Self::BankRouting - | Self::SwiftCode - | Self::PostalCode - | Self::PhoneNumber - | Self::EmailAddress - | Self::DateOfBirth - | Self::DateTime - | Self::IpAddress - | Self::MacAddress - | Self::LicensePlate - | Self::Coordinates - ) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn category_personal_identity() { - assert_eq!( - EntityKind::GovernmentId.category(), - EntityCategory::PersonalIdentity - ); - assert_eq!( - EntityKind::PersonName.category(), - EntityCategory::PersonalIdentity - ); - assert_eq!( - EntityKind::DateOfBirth.category(), - EntityCategory::PersonalIdentity - ); - } - - #[test] - fn category_contact_info() { - assert_eq!( - EntityKind::EmailAddress.category(), - EntityCategory::ContactInfo - ); - assert_eq!(EntityKind::Address.category(), EntityCategory::ContactInfo); - } - - #[test] - fn category_demographic() { - assert_eq!(EntityKind::Gender.category(), EntityCategory::Demographic); - assert_eq!( - EntityKind::Ethnicity.category(), - EntityCategory::Demographic - ); - assert_eq!(EntityKind::Religion.category(), EntityCategory::Demographic); - } - - #[test] - fn category_financial() { - assert_eq!( - EntityKind::PaymentCard.category(), - EntityCategory::Financial - ); - assert_eq!(EntityKind::Iban.category(), EntityCategory::Financial); - } - - #[test] - fn category_health() { - assert_eq!(EntityKind::MedicalId.category(), EntityCategory::Health); - assert_eq!(EntityKind::Diagnosis.category(), EntityCategory::Health); - assert_eq!(EntityKind::Medication.category(), EntityCategory::Health); - } - - #[test] - fn category_credentials() { - assert_eq!(EntityKind::Password.category(), EntityCategory::Credentials); - assert_eq!(EntityKind::ApiKey.category(), EntityCategory::Credentials); - } - - #[test] - fn category_biometric() { - assert_eq!( - 
EntityKind::Fingerprint.category(), - EntityCategory::Biometric - ); - assert_eq!(EntityKind::Voiceprint.category(), EntityCategory::Biometric); - assert_eq!(EntityKind::RetinaScan.category(), EntityCategory::Biometric); - assert_eq!(EntityKind::Face.category(), EntityCategory::Visual); - } - - #[test] - fn category_organizational() { - assert_eq!( - EntityKind::OrganizationName.category(), - EntityCategory::Organizational - ); - assert_eq!( - EntityKind::CaseNumber.category(), - EntityCategory::Organizational - ); - assert_eq!( - EntityKind::InternalId.category(), - EntityCategory::Organizational - ); - } -} diff --git a/crates/nvisy-core/src/entity/label/builtins.rs b/crates/nvisy-core/src/entity/label/builtins.rs new file mode 100644 index 00000000..63fe3d94 --- /dev/null +++ b/crates/nvisy-core/src/entity/label/builtins.rs @@ -0,0 +1,176 @@ +//! Built-in [`EntityLabel`] constants. +//! +//! Each constant carries a category tag (`personal_identity`, +//! `financial`, …) plus cross-cutting tags where applicable +//! (`pii`, `phi`, `pci`). Selectors can match by label name *or* +//! by tag without the workspace modelling categories as a +//! separate enum. +//! +//! The `BUILT_INS` slice indexes every constant for the +//! [`EntityLabelCatalog::with_builtins`][super::EntityLabelCatalog::with_builtins] +//! constructor; the constants themselves are public and reachable +//! by name (e.g. `builtins::PERSON_NAME`). + +use std::sync::LazyLock; + +use super::entity_label::EntityLabel; + +macro_rules! label { + ($vis:vis $ident:ident, $name:literal, $desc:literal, [ $($tag:literal),* $(,)? 
]) => { + $vis static $ident: LazyLock<EntityLabel> = LazyLock::new(|| { + EntityLabel::from_static($name, Some($desc), &[$($tag),*]) + }); + }; +} + +label!(pub PERSON_NAME, "person_name","Person name (full, first, or last).", ["personal_identity", "pii"]); +label!(pub DATE_OF_BIRTH, "date_of_birth","Date of birth.", ["personal_identity", "pii"]); +label!(pub GOVERNMENT_ID, "government_id","Government-issued identification number (SSN, SIN, Aadhaar, national ID, etc.).", ["personal_identity", "pii"]); +label!(pub TAX_ID, "tax_id","Tax identification number (ITIN, EIN, TIN, etc.).", ["personal_identity", "pii"]); +label!(pub DRIVERS_LICENSE, "drivers_license","Driver's license number.", ["personal_identity", "pii"]); +label!(pub PASSPORT_NUMBER, "passport_number","Passport number.", ["personal_identity", "pii"]); +label!(pub NATIONAL_INSURANCE_NUMBER, "national_insurance_number","National insurance or social-security equivalent (NI, BSN, AHVN, etc.).", ["personal_identity", "pii"]); +label!(pub VEHICLE_ID, "vehicle_id","Vehicle identification number (VIN).", ["personal_identity"]); +label!(pub LICENSE_PLATE, "license_plate","License plate number.", ["personal_identity"]); +label!(pub EMAIL_ADDRESS, "email_address","Email address.", ["contact_info", "pii"]); +label!(pub PHONE_NUMBER, "phone_number","Phone number.", ["contact_info", "pii"]); +label!(pub ADDRESS, "address","Physical or mailing address.", ["contact_info", "pii"]); +label!(pub POSTAL_CODE, "postal_code","Postal or ZIP code.", ["contact_info"]); +label!(pub URL, "url","URL or hyperlink.", ["contact_info"]); +label!(pub AGE, "age","Age value.", ["demographic", "pii"]); +label!(pub GENDER, "gender","Gender identity.", ["demographic", "pii"]); +label!(pub ETHNICITY, "ethnicity","Racial or ethnic background.", ["demographic", "pii"]); +label!(pub RELIGION, "religion","Religious affiliation.", ["demographic", "pii"]); +label!(pub NATIONALITY, "nationality","Nationality.", ["demographic", "pii"]); +label!(pub CITIZENSHIP, 
"citizenship","Citizenship status.", ["demographic", "pii"]); +label!(pub LANGUAGE, "language","Language or dialect spoken.", ["demographic"]); +label!(pub PAYMENT_CARD, "payment_card","Payment card number (credit or debit).", ["financial", "pci", "pii"]); +label!(pub CARD_SECURITY_CODE, "card_security_code","Payment card security code (CVV/CVC).", ["financial", "pci"]); +label!(pub CARD_EXPIRY, "card_expiry","Payment card expiration date.", ["financial", "pci"]); +label!(pub BANK_ACCOUNT, "bank_account","Bank account number.", ["financial", "pii"]); +label!(pub BANK_ROUTING, "bank_routing","Bank routing or transit number.", ["financial"]); +label!(pub IBAN, "iban","International Bank Account Number (IBAN).", ["financial", "pii"]); +label!(pub SWIFT_CODE, "swift_code","SWIFT/BIC code.", ["financial"]); +label!(pub CRYPTO_ADDRESS, "crypto_address","Cryptocurrency wallet address.", ["financial", "pii"]); +label!(pub CURRENCY, "currency","Currency code or symbol.", ["financial"]); +label!(pub AMOUNT, "amount","Monetary amount.", ["financial"]); +label!(pub MEDICAL_ID, "medical_id","Medical record number.", ["health", "phi", "pii"]); +label!(pub INSURANCE_ID, "insurance_id","Health insurance identifier.", ["health", "phi", "pii"]); +label!(pub PRESCRIPTION_ID, "prescription_id","Prescription identifier or medication regimen.", ["health", "phi"]); +label!(pub DIAGNOSIS, "diagnosis","Medical diagnosis or condition.", ["health", "phi"]); +label!(pub MEDICATION, "medication","Medication name.", ["health", "phi"]); +label!(pub FINGERPRINT, "fingerprint","Fingerprint biometric data.", ["biometric", "pii"]); +label!(pub VOICEPRINT, "voiceprint","Voiceprint biometric data.", ["biometric", "pii"]); +label!(pub RETINA_SCAN, "retina_scan","Retina scan biometric data.", ["biometric", "pii"]); +label!(pub FACIAL_GEOMETRY, "facial_geometry","Facial geometry biometric data.", ["biometric", "pii"]); +label!(pub PASSWORD, "password","Password.", ["credentials", "secret"]); +label!(pub 
API_KEY, "api_key","API key.", ["credentials", "secret"]); +label!(pub AUTH_TOKEN, "auth_token","Authentication token (OAuth, JWT, session token).", ["credentials", "secret"]); +label!(pub PRIVATE_KEY, "private_key","Private cryptographic key.", ["credentials", "secret"]); +label!(pub IP_ADDRESS, "ip_address","IP address (v4 or v6).", ["network_identifier", "pii"]); +label!(pub MAC_ADDRESS, "mac_address","MAC address.", ["network_identifier", "pii"]); +label!(pub DEVICE_ID, "device_id","Device identifier (IMEI, UDID, etc.).", ["network_identifier", "pii"]); +label!(pub USERNAME, "username","Username or handle.", ["network_identifier", "pii"]); +label!(pub COORDINATES, "coordinates","GPS coordinates (latitude/longitude).", ["location", "pii"]); +label!(pub GEOLOCATION_METADATA, "geolocation_metadata","Geolocation metadata.", ["location", "pii"]); +label!(pub FACE, "face","Human face detected in an image or video frame.", ["visual", "pii"]); +label!(pub HANDWRITING, "handwriting","Handwritten text.", ["visual"]); +label!(pub SIGNATURE, "signature","Handwritten signature.", ["visual", "pii"]); +label!(pub LOGO, "logo","Brand or organisation logo.", ["visual"]); +label!(pub BARCODE, "barcode","Barcode or QR code.", ["visual"]); +label!(pub ORGANIZATION_NAME, "organization_name","Organization or company name.", ["organization"]); +label!(pub DEPARTMENT_NAME, "department_name","Department or business-unit name.", ["organization"]); +label!(pub FACILITY_NAME, "facility_name","Physical facility or location name.", ["organization"]); +label!(pub CASE_NUMBER, "case_number","Case, matter, or docket number.", ["organization"]); +label!(pub INTERNAL_ID, "internal_id","Operator-defined internal identifier.", ["organization"]); +label!(pub DATE_TIME, "date_time","Date or time value.", ["temporal"]); +label!(pub EVENT, "event","Named event reference.", ["temporal"]); +label!(pub OCCUPATION, "occupation","Occupation or job title.", ["organization"]); +label!(pub PRODUCT, 
"product","Product name.", ["organization"]); +label!(pub QUANTITY, "quantity","Numerical quantity.", ["quantity"]); +label!(pub UNRESOLVED, "unresolved","Entity kind not yet identified.", ["unresolved"]); + +/// Every built-in label constant, indexed for catalog construction. +pub(super) static BUILT_INS: &[&LazyLock<EntityLabel>] = &[ + &PERSON_NAME, + &DATE_OF_BIRTH, + &GOVERNMENT_ID, + &TAX_ID, + &DRIVERS_LICENSE, + &PASSPORT_NUMBER, + &NATIONAL_INSURANCE_NUMBER, + &VEHICLE_ID, + &LICENSE_PLATE, + &EMAIL_ADDRESS, + &PHONE_NUMBER, + &ADDRESS, + &POSTAL_CODE, + &URL, + &AGE, + &GENDER, + &ETHNICITY, + &RELIGION, + &NATIONALITY, + &CITIZENSHIP, + &LANGUAGE, + &PAYMENT_CARD, + &CARD_SECURITY_CODE, + &CARD_EXPIRY, + &BANK_ACCOUNT, + &BANK_ROUTING, + &IBAN, + &SWIFT_CODE, + &CRYPTO_ADDRESS, + &CURRENCY, + &AMOUNT, + &MEDICAL_ID, + &INSURANCE_ID, + &PRESCRIPTION_ID, + &DIAGNOSIS, + &MEDICATION, + &FINGERPRINT, + &VOICEPRINT, + &RETINA_SCAN, + &FACIAL_GEOMETRY, + &PASSWORD, + &API_KEY, + &AUTH_TOKEN, + &PRIVATE_KEY, + &IP_ADDRESS, + &MAC_ADDRESS, + &DEVICE_ID, + &USERNAME, + &COORDINATES, + &GEOLOCATION_METADATA, + &FACE, + &HANDWRITING, + &SIGNATURE, + &LOGO, + &BARCODE, + &ORGANIZATION_NAME, + &DEPARTMENT_NAME, + &FACILITY_NAME, + &CASE_NUMBER, + &INTERNAL_ID, + &DATE_TIME, + &EVENT, + &OCCUPATION, + &PRODUCT, + &QUANTITY, + &UNRESOLVED, +]; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn well_known_built_ins_have_expected_tags() { + assert_eq!(PAYMENT_CARD.name, "payment_card"); + assert!(PAYMENT_CARD.has_tag("financial")); + assert!(PAYMENT_CARD.has_tag("pci")); + assert!(PAYMENT_CARD.has_tag("pii")); + assert_eq!(PERSON_NAME.name, "person_name"); + assert!(PERSON_NAME.has_tag("personal_identity")); + } +} diff --git a/crates/nvisy-core/src/entity/label/entity_label.rs b/crates/nvisy-core/src/entity/label/entity_label.rs new file mode 100644 index 00000000..329f5189 --- /dev/null +++ b/crates/nvisy-core/src/entity/label/entity_label.rs @@ -0,0 +1,174 @@ +//! 
[`EntityLabel`] — open vocabulary tag for detected entities. +//! +//! Any recognizer can mint a label by name, ship it through the +//! pipeline, and have the audit reference it verbatim. The workspace +//! ships a catalog of built-in labels in [`super::builtins`]; +//! recognizers and policy authors are free to invent new ones +//! (`acme-internal-id`, `medical-record-no`) without touching +//! workspace code. +//! +//! ## Identity +//! +//! Labels are identified by [`name`]; two labels with the same name +//! are considered the same entity kind regardless of differences in +//! [`description`] or [`tags`]. Selectors match by name. +//! +//! ## Tags +//! +//! [`tags`] is a free-form list of short identifiers that policy +//! selectors can match against. Built-in labels carry category +//! tags (`personal_identity`, `contact_info`, `financial`, etc.) +//! plus cross-cutting tags (`pii`, `phi`, `pci`). Custom labels +//! can ship with zero tags; selectors targeting tags only match +//! labels that carry them. +//! +//! [`name`]: EntityLabel::name +//! [`description`]: EntityLabel::description +//! [`tags`]: EntityLabel::tags + +use hipstr::HipStr; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +use super::EntityLabelRef; + +/// Open-vocabulary entity label: identity, optional description, +/// and zero or more tags. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct EntityLabel { + /// Canonical name of the label (e.g. `"person_name"`, + /// `"acme_internal_id"`). Selectors match by this value. + #[schemars(with = "String")] + pub name: HipStr<'static>, + /// Optional human-readable description of what the label + /// represents. Surfaced in audits and policy author tooling. + #[serde(default, skip_serializing_if = "Option::is_none")] + #[schemars(with = "Option")] + pub description: Option>, + /// Free-form tags grouping this label with related ones. 
+ /// Built-in labels carry category tags + /// (`personal_identity`, `financial`, …) plus cross-cutting + /// tags where applicable (`pii`, `phi`, `pci`). Empty for + /// untagged custom labels. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + #[schemars(with = "Vec")] + pub tags: Vec>, +} + +impl EntityLabel { + /// Construct a label from a runtime name. Description and tags + /// default to empty; use [`Self::with_description`] and + /// [`Self::with_tags`] to add them. + pub fn new(name: impl Into>) -> Self { + Self { + name: name.into(), + description: None, + tags: Vec::new(), + } + } + + /// Construct a label entirely from `&'static str` literals. + /// Used by the built-in catalog in [`super::builtins`] so the + /// strings live in static storage and runtime construction is + /// just one `Vec::from` per built-in. + pub fn from_static( + name: &'static str, + description: Option<&'static str>, + tags: &'static [&'static str], + ) -> Self { + Self { + name: HipStr::from_static(name), + description: description.map(HipStr::from_static), + tags: tags.iter().copied().map(HipStr::from_static).collect(), + } + } + + /// Attach a description. + #[must_use] + pub fn with_description(mut self, description: impl Into>) -> Self { + self.description = Some(description.into()); + self + } + + /// Attach tags. Replaces any previously set tags. + #[must_use] + pub fn with_tags(mut self, tags: I) -> Self + where + I: IntoIterator, + S: Into>, + { + self.tags = tags.into_iter().map(Into::into).collect(); + self + } + + /// Returns `true` when this label carries `tag` in its tag + /// list. Comparison is byte-for-byte. + #[must_use] + pub fn has_tag(&self, tag: &str) -> bool { + self.tags.iter().any(|t| t == tag) + } + + /// Construct a name-only [`EntityLabelRef`][super::EntityLabelRef] + /// handle to this label. Clones the underlying [`HipStr`] + /// (a refcount bump for `from_static` labels — no allocation). 
+ #[must_use] + pub fn label_ref(&self) -> EntityLabelRef { + EntityLabelRef::from(self.name.clone()) + } +} + +impl AsRef for EntityLabel { + fn as_ref(&self) -> &str { + &self.name + } +} + +impl std::fmt::Display for EntityLabel { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.name, f) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_static_round_trips() { + let l = EntityLabel::from_static( + "email_address", + Some("Email address."), + &["contact_info", "pii"], + ); + assert_eq!(l.name, "email_address"); + assert_eq!(l.description.as_deref(), Some("Email address.")); + assert!(l.has_tag("contact_info")); + assert!(l.has_tag("pii")); + assert!(!l.has_tag("financial")); + } + + #[test] + fn builder_setters_chain() { + let l = EntityLabel::new("acme_internal_id") + .with_description("ACME corp internal record id") + .with_tags(["custom", "acme"]); + assert_eq!(l.name, "acme_internal_id"); + assert_eq!( + l.description.as_deref(), + Some("ACME corp internal record id"), + ); + assert!(l.has_tag("acme")); + } + + #[test] + fn equality_ignores_metadata() { + // NOTE: deliberately *not* the behaviour today — `derive(PartialEq)` + // makes equality structural. If selectors need name-only equality + // they should compare `.name` explicitly. This test documents the + // current contract so a future change is intentional. + let a = EntityLabel::new("person_name").with_tags(["pii"]); + let b = EntityLabel::new("person_name"); + assert_ne!(a, b); + } +} diff --git a/crates/nvisy-core/src/entity/label/entity_label_catalog.rs b/crates/nvisy-core/src/entity/label/entity_label_catalog.rs new file mode 100644 index 00000000..89f02d12 --- /dev/null +++ b/crates/nvisy-core/src/entity/label/entity_label_catalog.rs @@ -0,0 +1,115 @@ +//! [`EntityLabelCatalog`] — name-indexed lookup over a set of +//! [`EntityLabel`]s. +//! +//! Constructed at runtime configuration time. Recognizers' +//! 
supported labels and selectors' tag-matching path both walk a +//! `EntityLabelCatalog`. The workspace ships a built-in catalog through +//! [`EntityLabelCatalog::with_builtins`]; deployments can register their own +//! labels alongside or instead of the built-ins via +//! [`EntityLabelCatalog::with_label`] / [`EntityLabelCatalog::with_labels`]. + +use std::collections::HashMap; +use std::sync::LazyLock; + +use hipstr::HipStr; + +use super::builtins::BUILT_INS; +use super::entity_label::EntityLabel; + +/// Name-indexed catalog of [`EntityLabel`]s. +/// +/// Built from a list of labels (mixing workspace-shipped built-ins +/// with deployment-defined custom labels). Construction copies each +/// label into a [`HashMap`] keyed by `HipStr` clone of the label's +/// name; subsequent lookups are O(1). +#[derive(Debug, Clone, Default)] +pub struct EntityLabelCatalog { + by_name: HashMap, EntityLabel>, +} + +impl EntityLabelCatalog { + /// Empty catalog. Built-ins must be registered explicitly via + /// [`Self::with_label`] / [`Self::with_labels`]; use + /// [`Self::with_builtins`] for the workspace-shipped set. + pub fn new() -> Self { + Self::default() + } + + /// EntityLabelCatalog pre-populated with every workspace-shipped built-in + /// label. + pub fn with_builtins() -> Self { + let mut cat = Self::new(); + for lazy in BUILT_INS { + cat.insert(LazyLock::force(lazy).clone()); + } + cat + } + + /// Register a single label. Replaces any prior entry sharing + /// the same [`EntityLabel::name`]. + pub fn insert(&mut self, label: EntityLabel) { + self.by_name.insert(label.name.clone(), label); + } + + /// Builder-style sibling of [`Self::insert`] returning `Self`. + #[must_use] + pub fn with_label(mut self, label: EntityLabel) -> Self { + self.insert(label); + self + } + + /// Bulk-register a sequence of labels. 
+ #[must_use] + pub fn with_labels(mut self, labels: I) -> Self + where + I: IntoIterator, + { + for l in labels { + self.insert(l); + } + self + } + + /// Look up a label by name. Returns `None` for names not + /// registered in this catalog. + pub fn lookup(&self, name: &str) -> Option<&EntityLabel> { + self.by_name.get(name) + } + + /// Iterator over every registered label, in no particular order. + pub fn iter(&self) -> impl Iterator + '_ { + self.by_name.values() + } + + /// Number of labels in the catalog. + pub fn len(&self) -> usize { + self.by_name.len() + } + + /// `true` when the catalog is empty. + pub fn is_empty(&self) -> bool { + self.by_name.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn builtin_catalog_resolves_known_names() { + let cat = EntityLabelCatalog::with_builtins(); + let l = cat.lookup("payment_card").expect("built-in"); + assert!(l.has_tag("financial")); + assert!(cat.lookup("acme_internal_id").is_none()); + } + + #[test] + fn catalog_accepts_custom_labels_alongside_builtins() { + let custom = EntityLabel::new("acme_internal_id").with_tags(["custom"]); + let cat = EntityLabelCatalog::with_builtins().with_label(custom); + assert!(cat.lookup("payment_card").is_some()); + let acme = cat.lookup("acme_internal_id").expect("custom registered"); + assert!(acme.has_tag("custom")); + } +} diff --git a/crates/nvisy-core/src/entity/label/entity_label_ref.rs b/crates/nvisy-core/src/entity/label/entity_label_ref.rs new file mode 100644 index 00000000..917519ef --- /dev/null +++ b/crates/nvisy-core/src/entity/label/entity_label_ref.rs @@ -0,0 +1,145 @@ +//! [`EntityLabelRef`] — name-only handle to an [`EntityLabel`]. +//! +//! Per-entity hot paths (`Entity::label`, audit refs, selector +//! matching) carry only the label's identifying name, not its full +//! catalog metadata. [`EntityLabelRef`] wraps a [`HipStr<'static>`] +//! so the surface is a single newtype rather than a bare string, +//! 
giving us a typed receiver for ergonomics like +//! `entity.label.matches("payment_card")`. +//! +//! Catalog-side metadata (description, tags) lives on +//! [`EntityLabel`] and is dereferenced through +//! [`EntityLabelCatalog::lookup`][catalog-lookup] when a consumer needs it. +//! +//! [`EntityLabel`]: super::EntityLabel +//! [catalog-lookup]: super::EntityLabelCatalog::lookup + +use std::fmt; + +use hipstr::HipStr; +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; + +/// Name-only handle to an [`EntityLabel`][super::EntityLabel]. +/// Cheap-clone wrapper around [`HipStr<'static>`]. +/// +/// Carried on every [`Entity`][crate::entity::Entity] in place of +/// the full catalog metadata. Two refs are equal when their names +/// are equal byte-for-byte. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] +#[serde(transparent)] +#[schemars(with = "String")] +pub struct EntityLabelRef(HipStr<'static>); + +impl EntityLabelRef { + /// Wrap a name. + pub fn new(name: impl Into>) -> Self { + Self(name.into()) + } + + /// Wrap a `&'static str` without allocating. + #[must_use] + pub const fn from_static(name: &'static str) -> Self { + Self(HipStr::from_static(name)) + } + + /// Borrow the underlying name. + #[must_use] + pub fn as_str(&self) -> &str { + self.0.as_str() + } + + /// Borrow the inner [`HipStr`]. + #[must_use] + pub fn as_hipstr(&self) -> &HipStr<'static> { + &self.0 + } + + /// Consume the ref and return the inner [`HipStr`]. + #[must_use] + pub fn into_hipstr(self) -> HipStr<'static> { + self.0 + } + + /// `true` when this ref names `name` byte-for-byte. 
+ #[must_use] + pub fn matches(&self, name: &str) -> bool { + self.0 == name + } +} + +impl AsRef for EntityLabelRef { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl fmt::Display for EntityLabelRef { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Display::fmt(&self.0, f) + } +} + +impl From> for EntityLabelRef { + fn from(value: HipStr<'static>) -> Self { + Self(value) + } +} + +impl From for HipStr<'static> { + fn from(value: EntityLabelRef) -> Self { + value.0 + } +} + +impl From<&'static str> for EntityLabelRef { + fn from(value: &'static str) -> Self { + Self::from_static(value) + } +} + +impl From for EntityLabelRef { + fn from(value: String) -> Self { + Self(HipStr::from(value)) + } +} + +impl PartialEq for EntityLabelRef { + fn eq(&self, other: &str) -> bool { + self.0 == other + } +} + +impl PartialEq<&str> for EntityLabelRef { + fn eq(&self, other: &&str) -> bool { + self.0 == *other + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn from_static_no_alloc() { + let r = EntityLabelRef::from_static("payment_card"); + assert_eq!(r.as_str(), "payment_card"); + assert!(r.matches("payment_card")); + assert!(!r.matches("person_name")); + } + + #[test] + fn equality_with_str() { + let r = EntityLabelRef::from_static("email_address"); + assert_eq!(r, "email_address"); + } + + #[test] + fn serde_transparent() { + let r = EntityLabelRef::from_static("ssn"); + let s = serde_json::to_string(&r).unwrap(); + assert_eq!(s, "\"ssn\""); + let back: EntityLabelRef = serde_json::from_str(&s).unwrap(); + assert_eq!(back, r); + } +} diff --git a/crates/nvisy-core/src/entity/label/mod.rs b/crates/nvisy-core/src/entity/label/mod.rs new file mode 100644 index 00000000..abfd2aff --- /dev/null +++ b/crates/nvisy-core/src/entity/label/mod.rs @@ -0,0 +1,28 @@ +//! Entity label types and catalog. +//! +//! Four concerns split across this folder: +//! +//! - [`EntityLabel`] — full catalog entry (name + description + +//! tags). 
Authored once per label; consumed by selectors and +//! audit-rendering tooling that need the metadata. +//! - [`EntityLabelRef`] — name-only handle stored on every +//! detected [`Entity`][crate::entity::Entity]. Cheap-clone +//! wrapper around [`HipStr<'static>`][hipstr::HipStr]. +//! - [`EntityLabelCatalog`] — name-indexed lookup over a collection of +//! `EntityLabel`s. The workspace ships a built-in catalog +//! constructed from [`EntityLabelCatalog::with_builtins`]; consumers can +//! register custom labels alongside or instead of the +//! built-ins. +//! - [`builtins`] — every built-in `EntityLabel` constant +//! (`builtins::PERSON_NAME`, `builtins::EMAIL_ADDRESS`, …) plus +//! the internal `BUILT_INS` slice the catalog walks at +//! construction time. + +pub mod builtins; +mod entity_label; +mod entity_label_catalog; +mod entity_label_ref; + +pub use self::entity_label::EntityLabel; +pub use self::entity_label_catalog::EntityLabelCatalog; +pub use self::entity_label_ref::EntityLabelRef; diff --git a/crates/nvisy-core/src/entity/method/provenance.rs b/crates/nvisy-core/src/entity/method/provenance.rs index f2cf834e..712f154d 100644 --- a/crates/nvisy-core/src/entity/method/provenance.rs +++ b/crates/nvisy-core/src/entity/method/provenance.rs @@ -5,10 +5,7 @@ use serde::{Deserialize, Serialize}; /// Provenance for a pattern-based detection (regex, dictionary, /// deny-list). Each variant carries only the fields meaningful for -/// that matcher — the old flat `PatternKind` + `Option` -/// representation allowed invalid combinations (a `Regex` row with -/// no pattern name, a `DenyList` row with a stale validator) that -/// can't be constructed in this shape. +/// that matcher. 
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Serialize, Deserialize, JsonSchema)] #[serde(tag = "kind", rename_all = "snake_case")] diff --git a/crates/nvisy-core/src/entity/mod.rs b/crates/nvisy-core/src/entity/mod.rs index 3c23f273..efef6d5f 100644 --- a/crates/nvisy-core/src/entity/mod.rs +++ b/crates/nvisy-core/src/entity/mod.rs @@ -17,8 +17,7 @@ //! [`AuditEntry`]: https://docs.rs/nvisy-engine/latest/nvisy_engine/provenance/struct.AuditEntry.html mod annotation; -mod category; -mod kind; +pub mod label; mod method; mod source; @@ -30,8 +29,7 @@ use uuid::Uuid; pub use self::annotation::{ Annotation, AnnotationKind, AnnotationStrength, LabelAnnotation, is_excluded, }; -pub use self::category::EntityCategory; -pub use self::kind::EntityKind; +pub use self::label::{EntityLabel, EntityLabelCatalog, EntityLabelRef, builtins}; pub use self::method::{ AnnotationProvenance, ModelProvenance, PatternProvenance, TrailProvenance, TrailStep, TrailStepKind, @@ -44,11 +42,8 @@ use crate::primitive::{Confidence, LanguageTag}; /// A detected sensitive data occurrence within a document. /// -/// The category for an entity is derived from its [`entity_kind`] via -/// [`EntityKind::category`]; it is not stored separately. The trail -/// of score-affecting steps lives on [`trail`]. +/// The trail of score-affecting steps lives on [`trail`]. /// -/// [`entity_kind`]: Self::entity_kind /// [`trail`]: Self::trail #[derive(Debug, Clone, PartialEq, Builder)] #[derive(Serialize, Deserialize, JsonSchema)] @@ -68,9 +63,11 @@ pub struct Entity { #[builder(default, setter(into = false))] #[serde(default, skip_serializing_if = "Option::is_none")] pub entity_id: Option, - /// Specific entity kind. The broad [`EntityCategory`] is derived - /// via [`Entity::category`]. - pub entity_kind: EntityKind, + /// Open-vocabulary classification of the entity. 
Wraps the + /// label's identifying name; full catalog metadata + /// (description, tags) is dereferenced through an + /// [`EntityLabelCatalog`]. + pub label: EntityLabelRef, /// Modality-specific location of the entity within the document. pub location: M::Location, /// Detection confidence score in the range `[0.0, 1.0]`. Equals @@ -100,12 +97,6 @@ impl Entity { EntityBuilder::default() } - /// Derived broad classification — `self.entity_kind.category()`. - #[must_use] - pub fn category(&self) -> EntityCategory { - self.entity_kind.category() - } - /// Original recognition score, before any post-recognition /// adjustments. Reads from the first step's `original` (or /// `adjusted` if it had none), returning `None` only if the @@ -142,7 +133,9 @@ impl Entity { pub fn test_builder(start: usize, end: usize) -> EntityBuilder { let conf = Confidence::clamped(0.9); Entity::builder() - .with_entity_kind(EntityKind::PersonName) + .with_label(EntityLabelRef::from( + self::builtins::PERSON_NAME.name.clone(), + )) .with_trail(vec![TrailStep::recognition( "pattern", conf, diff --git a/crates/nvisy-core/src/primitive/confidence/value.rs b/crates/nvisy-core/src/primitive/confidence/value.rs index 52d8c706..540fc7e4 100644 --- a/crates/nvisy-core/src/primitive/confidence/value.rs +++ b/crates/nvisy-core/src/primitive/confidence/value.rs @@ -200,5 +200,4 @@ mod tests { assert!(serde_json::from_str::("1.4").is_err()); assert!(serde_json::from_str::("-0.2").is_err()); } - } diff --git a/crates/nvisy-core/src/recognition/hint.rs b/crates/nvisy-core/src/recognition/hint.rs index 081149eb..1f9fa9c7 100644 --- a/crates/nvisy-core/src/recognition/hint.rs +++ b/crates/nvisy-core/src/recognition/hint.rs @@ -18,7 +18,7 @@ //! [`Entity`]: crate::entity::Entity //! 
[`RecognizerInput::hints`]: super::RecognizerInput::hints -use crate::entity::EntityKind; +use crate::entity::EntityLabelRef; use crate::modality::Modality; /// Uploader-supplied annotation region in modality-native @@ -29,20 +29,22 @@ pub struct Hint { /// confirm or relocate this hint forward the name into the /// emitted entity's recognition trail step. pub name: Option, - /// Uploader-claimed entity kind (optional). - pub entity_kind: Option, + /// Uploader-claimed label (optional). When set, recognizers + /// that confirm the hint stamp this on the emitted entity's + /// `label` field. + pub label: Option, /// Region in modality-native coordinates. pub location: M::Location, } impl Hint { - /// Construct a hint with only the location set; name and kind + /// Construct a hint with only the location set; name and label /// default to `None`. #[must_use] pub fn new(location: M::Location) -> Self { Self { name: None, - entity_kind: None, + label: None, location, } } @@ -54,10 +56,10 @@ impl Hint { self } - /// Attach an uploader-claimed entity kind. + /// Attach an uploader-claimed label. #[must_use] - pub fn with_entity_kind(mut self, entity_kind: EntityKind) -> Self { - self.entity_kind = Some(entity_kind); + pub fn with_label(mut self, label: impl Into) -> Self { + self.label = Some(label.into()); self } } diff --git a/crates/nvisy-core/src/recognition/label_map.rs b/crates/nvisy-core/src/recognition/label_map.rs index cb742dd5..b19fcd34 100644 --- a/crates/nvisy-core/src/recognition/label_map.rs +++ b/crates/nvisy-core/src/recognition/label_map.rs @@ -1,98 +1,114 @@ -//! [`LabelMap`]: model-label → [`EntityKind`] translation table. +//! [`LabelMap`]: backend label → canonical label-name translation. //! //! Shared translation table used by every model-driven recognizer //! (NER backends, LLM recognizers, …). Lets a recognizer consume //! raw model labels uniformly regardless of which backend produced //! 
them — swap backends without re-implementing translation. //! -//! The map is bidirectional in spirit (look up an [`EntityKind`] to -//! find the canonical label a backend should be asked for) but the -//! primary path is label→kind. The reverse lookup is a linear -//! scan; if a future backend needs frequent reverse lookups we'll -//! cache both directions. +//! The map is bidirectional in spirit (look up an entity label +//! name to find the canonical model label a backend should be asked +//! for) but the primary path is model-label → entity-label-name. +//! The reverse lookup is a linear scan. use std::borrow::Cow; use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use strum::IntoEnumIterator; -use crate::entity::EntityKind; +use crate::entity::{EntityLabelCatalog, EntityLabelRef}; -/// Translation table from raw model labels to canonical -/// [`EntityKind`] values. +/// Translation table from raw model labels to entity label names. /// -/// The default ([`LabelMap::canonical`]) maps every `EntityKind`'s -/// snake_case string form to itself, so backends that already -/// return canonical labels (the Bento `inference-gliner` today) -/// pass through unchanged. Custom backends register their own -/// model-specific labels via [`with_entry`]. +/// The default ([`LabelMap::canonical`]) maps every name in a +/// [`EntityLabelCatalog`] to itself, so backends that already return canonical +/// names pass through unchanged. Custom backends register their +/// own model-specific labels via [`with_entry`]. /// /// [`with_entry`]: Self::with_entry #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] #[serde(transparent)] pub struct LabelMap { - entries: HashMap, + entries: HashMap, } impl LabelMap { /// Empty map. Backends with no recognizable labels see every - /// span dropped — typically you want - /// [`canonical`] instead. 
- /// - /// [`canonical`]: Self::canonical + /// span dropped — typically you want [`Self::canonical`] / + /// [`Self::canonical_from`] instead. #[must_use] pub fn new() -> Self { Self::default() } - /// Identity map: every [`EntityKind`] mapped to its own - /// canonical snake_case label. Use when the backend already - /// returns canonical labels. + /// Identity map from every label name in the workspace + /// built-in [`EntityLabelCatalog`] to itself. Convenience wrapper around + /// [`Self::canonical_from`] for callers that don't need custom + /// labels. #[must_use] pub fn canonical() -> Self { - let entries = EntityKind::iter() - .map(|kind| (kind.to_string(), kind)) + Self::canonical_from(&EntityLabelCatalog::with_builtins()) + } + + /// Identity map over every name in the supplied catalog. + /// Backends that already return canonical names — or that + /// have been pre-registered with the catalog — pass through + /// unchanged. + #[must_use] + pub fn canonical_from(catalog: &EntityLabelCatalog) -> Self { + let entries = catalog + .iter() + .map(|label| { + ( + label.name.to_string(), + EntityLabelRef::from(label.name.clone()), + ) + }) .collect(); Self { entries } } - /// Register one label→kind entry. Last write wins on duplicate - /// labels. + /// Register one model-label → entity-label-ref entry. Last + /// write wins on duplicates. #[must_use] - pub fn with_entry(mut self, label: impl Into>, kind: EntityKind) -> Self { - self.entries.insert(label.into().into_owned(), kind); + pub fn with_entry( + mut self, + model_label: impl Into>, + entity_label: impl Into, + ) -> Self { + self.entries + .insert(model_label.into().into_owned(), entity_label.into()); self } /// Register many entries. 
#[must_use] - pub fn with_entries(mut self, entries: I) -> Self + pub fn with_entries(mut self, entries: I) -> Self where - I: IntoIterator, - S: Into, + I: IntoIterator, + K: Into, + V: Into, { - for (label, kind) in entries { - self.entries.insert(label.into(), kind); + for (model_label, entity_label) in entries { + self.entries.insert(model_label.into(), entity_label.into()); } self } - /// Look up a raw label. `None` when not registered. + /// Look up a raw model label. `None` when not registered. #[must_use] - pub fn lookup(&self, label: &str) -> Option { - self.entries.get(label).copied() + pub fn lookup(&self, model_label: &str) -> Option<&EntityLabelRef> { + self.entries.get(model_label) } - /// Find a label string that maps to `kind`. Linear scan; - /// returns the first match. Used by zero-shot backends that - /// need to format requested-kinds as raw labels for the - /// service. + /// Find a model label that maps to the given entity label + /// name. Linear scan; returns the first match. Used by + /// zero-shot backends that need to format requested-labels as + /// raw model labels for the service. #[must_use] - pub fn label_for(&self, kind: EntityKind) -> Option<&str> { + pub fn model_label_for(&self, entity_label: &str) -> Option<&str> { self.entries .iter() - .find_map(|(label, k)| (*k == kind).then_some(label.as_str())) + .find_map(|(m, e)| (e.as_str() == entity_label).then_some(m.as_str())) } /// Number of registered entries. 
@@ -111,26 +127,30 @@ impl LabelMap { #[cfg(test)] mod tests { use super::*; + use crate::entity::builtins; #[test] fn canonical_map_resolves_known_labels() { let map = LabelMap::canonical(); - assert_eq!(map.lookup("email_address"), Some(EntityKind::EmailAddress)); - assert_eq!(map.lookup("ssn"), None); + assert_eq!( + map.lookup("email_address").map(|r| r.as_str()), + Some("email_address") + ); + assert!(map.lookup("ssn").is_none()); } #[test] fn custom_entries_override_canonical() { - let map = LabelMap::canonical().with_entry("PER", EntityKind::PersonName); - assert_eq!(map.lookup("PER"), Some(EntityKind::PersonName)); + let map = LabelMap::canonical().with_entry( + "PER", + EntityLabelRef::from(builtins::PERSON_NAME.name.clone()), + ); + assert_eq!(map.lookup("PER").map(|r| r.as_str()), Some("person_name")); } #[test] - fn label_for_round_trips() { + fn model_label_for_round_trips() { let map = LabelMap::canonical(); - assert_eq!( - map.label_for(EntityKind::EmailAddress), - Some("email_address") - ); + assert_eq!(map.model_label_for("email_address"), Some("email_address")); } } diff --git a/crates/nvisy-engine/src/phases/detection.rs b/crates/nvisy-engine/src/phases/detection.rs index a2f1c74b..69dd7cdf 100644 --- a/crates/nvisy-engine/src/phases/detection.rs +++ b/crates/nvisy-engine/src/phases/detection.rs @@ -153,7 +153,7 @@ where let detected = registry.run::(input).await?; for entity in detected { - if !cfg.entity_kinds.is_empty() && !cfg.entity_kinds.contains(&entity.entity_kind) { + if !cfg.labels.is_empty() && !cfg.labels.contains(&entity.label) { continue; } let Some(location) = @@ -161,7 +161,7 @@ where else { tracing::debug!( target: TARGET, - kind = %entity.entity_kind, + label = %entity.label, "dropping entity with no overlapping span", ); continue; @@ -169,7 +169,7 @@ where lifted.push(Entity { id: entity.id, entity_id: entity.entity_id, - entity_kind: entity.entity_kind, + label: entity.label, location, confidence: entity.confidence, trail: 
entity.trail, @@ -211,7 +211,7 @@ async fn detect_image_chunks( let detected = registry.run::(input).await?; let filtered: Vec> = detected .into_iter() - .filter(|e| cfg.entity_kinds.is_empty() || cfg.entity_kinds.contains(&e.entity_kind)) + .filter(|e| cfg.labels.is_empty() || cfg.labels.contains(&e.label)) .collect(); detected_total += filtered.len(); doc.add_entities(filtered); diff --git a/crates/nvisy-engine/src/phases/redaction/mod.rs b/crates/nvisy-engine/src/phases/redaction/mod.rs index 89fb75f7..84cdc7f3 100644 --- a/crates/nvisy-engine/src/phases/redaction/mod.rs +++ b/crates/nvisy-engine/src/phases/redaction/mod.rs @@ -49,8 +49,8 @@ use crate::document::provenance::{ AuditEntry, Decision as AuditDecision, EntryMetadata, Execution, }; use crate::modality::DocumentModality; -use crate::policy::{Action, PolicyDecisionRef}; use crate::policy::redaction::Instantiate; +use crate::policy::{Action, PolicyDecisionRef}; pub(crate) const TARGET: &str = "nvisy_engine::redaction"; diff --git a/crates/nvisy-engine/src/pipeline/config/detection/mod.rs b/crates/nvisy-engine/src/pipeline/config/detection/mod.rs index 3c50602b..6a1db8ba 100644 --- a/crates/nvisy-engine/src/pipeline/config/detection/mod.rs +++ b/crates/nvisy-engine/src/pipeline/config/detection/mod.rs @@ -18,7 +18,7 @@ mod plan; #[cfg(not(feature = "bento"))] use nvisy_core::Error; use nvisy_core::Result; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::{EntityLabelCatalog, EntityLabelRef}; use nvisy_core::modality::Text; use nvisy_ner::NerRecognizer; use nvisy_ner::backend::NoopBackend; @@ -87,7 +87,7 @@ impl DetectionConfig { let recognizer = NerRecognizer::builder() .with_name(NER_RECOGNIZER_NAME) .with_engine(NoopBackend) - .with_supported_kinds(default_text_kinds()) + .with_supported_labels(default_text_labels()) .build()?; reg.with_recognizer::(recognizer) } @@ -98,7 +98,7 @@ impl DetectionConfig { let recognizer = NerRecognizer::builder() .with_name(NER_RECOGNIZER_NAME) 
.with_engine(backend) - .with_supported_kinds(default_text_kinds()) + .with_supported_labels(default_text_labels()) .build()?; reg.with_recognizer::(recognizer) } @@ -117,16 +117,16 @@ impl DetectionConfig { } } -/// Default kind allowlist for the engine-side NER recognizer. +/// Default label allowlist for the engine-side NER recognizer. /// -/// Every defined [`EntityKind`] except those that only surface in -/// images (biometric templates, visual elements). The zero-shot model -/// is fed this list as "look for any of these"; centralised +/// Every built-in label except those that only surface in images +/// (biometric templates, visual elements). The zero-shot model is +/// fed this list as "look for any of these"; centralised /// post-filtering at the dispatch layer narrows further per call. -/// -/// [`EntityKind`]: nvisy_core::entity::EntityKind -fn default_text_kinds() -> Vec { - EntityKind::all() - .filter(|k| !k.is_biometric() && !k.is_visual()) +fn default_text_labels() -> Vec { + EntityLabelCatalog::with_builtins() + .iter() + .filter(|l| !l.has_tag("biometric") && !l.has_tag("visual")) + .map(|l| l.label_ref()) .collect() } diff --git a/crates/nvisy-engine/src/pipeline/config/detection/plan.rs b/crates/nvisy-engine/src/pipeline/config/detection/plan.rs index 9e2f7d04..0574513f 100644 --- a/crates/nvisy-engine/src/pipeline/config/detection/plan.rs +++ b/crates/nvisy-engine/src/pipeline/config/detection/plan.rs @@ -27,7 +27,7 @@ //! [`entity_kinds`]: Detection::entity_kinds //! [`RecognizerRegistry`]: crate::detection::RecognizerRegistry -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::EntityLabelRef; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use validator::Validate; @@ -36,10 +36,10 @@ use validator::Validate; #[derive(Debug, Clone, Default, PartialEq, Validate)] #[derive(Serialize, Deserialize, JsonSchema)] pub struct Detection { - /// Entity-kind allowlist applied to every dispatched recognizer. 
- /// Empty = all kinds permitted. + /// Entity-label allowlist applied to every dispatched recognizer. + /// Empty = all labels permitted. #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub entity_kinds: Vec, + pub labels: Vec, } impl Detection { diff --git a/crates/nvisy-engine/src/pipeline/redaction/applicator.rs b/crates/nvisy-engine/src/pipeline/redaction/applicator.rs index 9c5af220..17ab462d 100644 --- a/crates/nvisy-engine/src/pipeline/redaction/applicator.rs +++ b/crates/nvisy-engine/src/pipeline/redaction/applicator.rs @@ -28,7 +28,7 @@ use std::collections::{HashMap, HashSet}; use jiff::Timestamp; use nvisy_core::Error; -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef}; use nvisy_core::modality::ModalityKind; use nvisy_core::primitive::Confidence; use uuid::Uuid; @@ -188,9 +188,7 @@ where ) })?; let prior = record.audit.take(); - let policy_ref = prior - .as_ref() - .and_then(|e| e.decision.policy_ref.clone()); + let policy_ref = prior.as_ref().and_then(|e| e.decision.policy_ref.clone()); record.audit = Some(AuditEntry { decision: Decision { policy_ref, @@ -211,23 +209,23 @@ where /// Append a synthesised entity to the matching audit. fn append_add(audits: &mut [AnyAudit], add: RedactionAddEntity) -> Result<(), Error> { + let kind = add.location.kind(); + // Optional pinned operator: if it disagrees with the // location's modality, fail. if let Some(op) = &add.operator - && op.modality() != add.location.kind() + && op.modality() != kind { return Err(Error::validation( format!( - "override Add operator modality {:?} differs from location modality {:?}", + "override Add operator modality {:?} differs from location modality {kind:?}", op.modality(), - add.location.kind(), ), TARGET, )); } // Find a target audit of matching modality. 
- let kind = add.location.kind(); let target = audits.iter_mut().find(|a| { matches!( (a, kind), @@ -240,44 +238,47 @@ fn append_add(audits: &mut [AnyAudit], add: RedactionAddEntity) -> Result<(), Er let Some(target) = target else { return Err(Error::validation( format!( - "override Add for modality {:?} has no audit of that modality in the detection", - add.location.kind(), + "override Add for modality {kind:?} has no audit of that modality in the detection", ), TARGET, )); }; - let entity_kind = add.entity_kind; - let operator = add.operator; - let location = add.location; match target { AnyAudit::Text(a) => { - let loc = location.try_as_text().ok_or_else(modality_mismatch)?; - let op = operator + let loc = add.location.try_as_text().ok_or_else(modality_mismatch)?; + let op = add + .operator .map(|o| o.try_as_text().ok_or_else(modality_mismatch)) .transpose()?; - append_typed(a, entity_kind, loc, op); + append_typed(a, add.label, loc, op); } AnyAudit::Tabular(a) => { - let loc = location.try_as_tabular().ok_or_else(modality_mismatch)?; - let op = operator + let loc = add + .location + .try_as_tabular() + .ok_or_else(modality_mismatch)?; + let op = add + .operator .map(|o| o.try_as_tabular().ok_or_else(modality_mismatch)) .transpose()?; - append_typed(a, entity_kind, loc, op); + append_typed(a, add.label, loc, op); } AnyAudit::Image(a) => { - let loc = location.try_as_image().ok_or_else(modality_mismatch)?; - let op = operator + let loc = add.location.try_as_image().ok_or_else(modality_mismatch)?; + let op = add + .operator .map(|o| o.try_as_image().ok_or_else(modality_mismatch)) .transpose()?; - append_typed(a, entity_kind, loc, op); + append_typed(a, add.label, loc, op); } AnyAudit::Audio(a) => { - let loc = location.try_as_audio().ok_or_else(modality_mismatch)?; - let op = operator + let loc = add.location.try_as_audio().ok_or_else(modality_mismatch)?; + let op = add + .operator .map(|o| o.try_as_audio().ok_or_else(modality_mismatch)) .transpose()?; - 
append_typed(a, entity_kind, loc, op); + append_typed(a, add.label, loc, op); } } Ok(()) @@ -295,7 +296,7 @@ fn modality_mismatch() -> Error { /// Append a synthesised entity to a typed audit. fn append_typed( audit: &mut Audit, - entity_kind: EntityKind, + label: EntityLabelRef, location: M::Location, operator: Option, ) where @@ -304,7 +305,7 @@ fn append_typed( let entity = Entity { id: Uuid::now_v7(), entity_id: None, - entity_kind, + label, location, confidence: Confidence::clamped(1.0), trail: Vec::new(), diff --git a/crates/nvisy-engine/src/pipeline/redaction/override_.rs b/crates/nvisy-engine/src/pipeline/redaction/override_.rs index 02c0967a..f7d82ee6 100644 --- a/crates/nvisy-engine/src/pipeline/redaction/override_.rs +++ b/crates/nvisy-engine/src/pipeline/redaction/override_.rs @@ -42,7 +42,7 @@ use std::collections::HashSet; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::EntityLabelRef; use nvisy_core::modality::ModalityKind; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -118,8 +118,8 @@ pub struct RedactionAddEntity { /// tag (`text`/`tabular`/`image`/`audio`) determines the /// entity's modality. pub location: AnyLocation, - /// Entity kind (drives policy evaluation and operator pick). - pub entity_kind: EntityKind, + /// Entity label (drives policy evaluation and operator pick). + pub label: EntityLabelRef, /// When `Some`, pins the operator the redaction will apply, /// bypassing the policy chain for this entity. 
Must match /// `location`'s modality; mismatches are rejected at diff --git a/crates/nvisy-engine/src/pipeline/redaction/pipeline.rs b/crates/nvisy-engine/src/pipeline/redaction/pipeline.rs index 206cf271..070100b3 100644 --- a/crates/nvisy-engine/src/pipeline/redaction/pipeline.rs +++ b/crates/nvisy-engine/src/pipeline/redaction/pipeline.rs @@ -196,5 +196,4 @@ impl RedactionPipeline { Ok(()) } - } diff --git a/crates/nvisy-engine/src/policy/mod.rs b/crates/nvisy-engine/src/policy/mod.rs index 164cfe4c..aab02908 100644 --- a/crates/nvisy-engine/src/policy/mod.rs +++ b/crates/nvisy-engine/src/policy/mod.rs @@ -209,10 +209,7 @@ pub struct PolicyDecisionRef { impl PolicyDecisionRef { /// Construct a reference from a policy + rule name. - pub fn new( - policy_name: HipStr<'static>, - rule_name: Option>, - ) -> Self { + pub fn new(policy_name: HipStr<'static>, rule_name: Option>) -> Self { Self { policy_name, rule_name, @@ -339,8 +336,7 @@ mod tests { let policies = vec![text_policy("gdpr", &["dup", "dup"])]; let err = validate_policy_namespace(&policies).unwrap_err(); assert!( - err.to_string() - .contains("duplicate rule name `dup`"), + err.to_string().contains("duplicate rule name `dup`"), "got: {err}" ); } diff --git a/crates/nvisy-engine/src/policy/selector.rs b/crates/nvisy-engine/src/policy/selector.rs index e16fb782..de20df1f 100644 --- a/crates/nvisy-engine/src/policy/selector.rs +++ b/crates/nvisy-engine/src/policy/selector.rs @@ -1,28 +1,47 @@ //! Entity selection criteria for policy rules. -use nvisy_core::entity::{Entity, EntityCategory, EntityKind}; +use std::sync::LazyLock; + +use hipstr::HipStr; +use nvisy_core::entity::{Entity, EntityLabelCatalog, EntityLabelRef}; use nvisy_core::primitive::ConfidenceThreshold; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; use crate::modality::DocumentModality; +/// Workspace-shipped built-in catalog. 
Selectors use this when no +/// custom catalog is plumbed in (selectors don't carry a catalog +/// today; tag matching dereferences against this catalog only). +static BUILTIN_CATALOG: LazyLock = + LazyLock::new(EntityLabelCatalog::with_builtins); + /// Criteria for selecting which entities a policy rule applies to. /// -/// All fields use "empty means all" semantics: an empty `categories` list -/// matches every category, an empty `entity_types` list matches every type, -/// and so on. When multiple fields are set, they are combined with AND logic. +/// All fields use "empty means all" semantics: an empty `labels` +/// list matches every label, an empty `tags` list matches every +/// tag, and so on. When multiple fields are set, they are combined +/// with AND logic. #[derive(Debug, Clone, Default, Serialize, Deserialize, JsonSchema)] #[serde(rename_all = "camelCase")] pub struct EntitySelector { - /// Entity categories this selector matches. Empty means all categories. + /// Specific entity labels this selector matches. Empty means + /// all labels. #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub entity_categories: Vec, - /// Specific entity kinds this selector matches. Empty means all kinds. + pub labels: Vec, + /// Tags this selector matches. An entity matches when its + /// label (looked up in the workspace built-in catalog) carries + /// any of the listed tags. Custom labels not registered in the + /// catalog never match a tag selector — they must be matched + /// by name via [`labels`]. + /// + /// [`labels`]: Self::labels #[serde(default, skip_serializing_if = "Vec::is_empty")] - pub entity_kinds: Vec, - /// Minimum detection confidence required. Entities below this threshold - /// are not matched. `None` means no threshold (matches any confidence). + #[schemars(with = "Vec")] + pub tags: Vec>, + /// Minimum detection confidence required. Entities below this + /// threshold are not matched. 
`None` means no threshold + /// (matches any confidence). #[serde(default, skip_serializing_if = "Option::is_none")] pub confidence_threshold: Option, } @@ -40,15 +59,20 @@ impl EntitySelector { { return false; } - if !self.entity_categories.is_empty() - && !self.entity_categories.contains(&entity.category()) - { - return false; + if !self.tags.is_empty() { + let Some(catalog_entry) = BUILTIN_CATALOG.lookup(entity.label.as_str()) else { + // Custom label (not in the built-in catalog) — tags + // are catalog-side metadata, so the tag filter + // never matches a custom label. + return false; + }; + if !self.tags.iter().any(|t| catalog_entry.has_tag(t)) { + return false; + } } - if !self.entity_kinds.is_empty() && !self.entity_kinds.contains(&entity.entity_kind) { + if !self.labels.is_empty() && !self.labels.contains(&entity.label) { return false; } - true } } diff --git a/crates/nvisy-fake/src/anonymizer/mod.rs b/crates/nvisy-fake/src/anonymizer/mod.rs index 7854b514..e1f0f456 100644 --- a/crates/nvisy-fake/src/anonymizer/mod.rs +++ b/crates/nvisy-fake/src/anonymizer/mod.rs @@ -8,7 +8,7 @@ use std::sync::Arc; use fake::rand::SeedableRng; use fake::rand::rngs::SmallRng; use nvisy_core::Result; -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::Entity; use nvisy_core::modality::{Modality, Tabular, Text, TextData, TextLocation}; use nvisy_core::primitive::LanguageTag; use nvisy_core::redaction::{Anonymizer, LeakProfile, TabularReplacement, TextReplacement}; @@ -94,17 +94,17 @@ impl Fake { SmallRng::seed_from_u64(hasher.finish()) } - /// Try the generator for `kind`; return `None` if it has no + /// Try the generator for `label`; return `None` if it has no /// entry, so the caller can delegate to the fallback. 
fn try_generate( &self, locale: Locale, - kind: EntityKind, + label: &str, identity: Identity<'_>, source: &str, ) -> Option { let mut rng = self.rng_for(identity); - generator::Context::new(locale, kind, source).generate(&mut rng) + generator::Context::new(locale, label, source).generate(&mut rng) } } @@ -121,7 +121,7 @@ impl Anonymizer for Fake { let locale = self.locale_for(entity.language.as_ref()); match self.try_generate( locale, - entity.entity_kind, + entity.label.as_str(), Identity::from(entity), source.text.as_str(), ) { @@ -145,7 +145,7 @@ impl Anonymizer for Fake { let locale = self.locale_for(entity.language.as_ref()); if let Some(value) = self.try_generate( locale, - entity.entity_kind, + entity.label.as_str(), Identity::from(entity), source.text.as_str(), ) { @@ -171,7 +171,7 @@ impl Anonymizer for Fake { fn adapt_to_text(entity: &Entity) -> Entity { let mut builder = Entity::::builder() .with_id(entity.id) - .with_entity_kind(entity.entity_kind) + .with_label(entity.label.clone()) .with_location(TextLocation::new(0, 0)) .with_confidence(entity.confidence) .with_trail(entity.trail.clone()); @@ -223,7 +223,7 @@ impl Hash for Identity<'_> { mod tests { use std::collections::HashSet; - use nvisy_core::entity::EntityKind; + use nvisy_core::entity::{EntityLabelRef, builtins}; use nvisy_core::redaction::Anonymizer as _; use nvisy_toolkit::redaction::anonymizer::{Mask, Replace}; @@ -236,15 +236,15 @@ mod tests { /// Build an entity whose location spans the entire `source` /// string. Tests don't care about offsets — making them match /// the source makes them self-documenting. 
- fn entity_over(kind: EntityKind, source: &str) -> Entity { + fn entity_over(label: EntityLabelRef, source: &str) -> Entity { Entity::test_builder(0, source.len()) - .with_entity_kind(kind) + .with_label(label) .test_build() } - fn coref_entity(kind: EntityKind, source: &str, coref_id: &str) -> Entity { + fn coref_entity(label: EntityLabelRef, source: &str, coref_id: &str) -> Entity { Entity::test_builder(0, source.len()) - .with_entity_kind(kind) + .with_label(label) .with_entity_id(coref_id.to_string()) .test_build() } @@ -256,7 +256,7 @@ mod tests { // fallback anonymizer. let op = Fake::new(Replace::new("[redacted]")); let source = TextData::new("hypertension"); - let entity = entity_over(EntityKind::Diagnosis, source.text.as_str()); + let entity = entity_over(builtins::DIAGNOSIS.label_ref(), source.text.as_str()); let out = op.apply(&entity, &source).await.unwrap(); assert_eq!(out, TextReplacement::substituted("[redacted]")); } @@ -265,7 +265,7 @@ mod tests { async fn fallback_can_be_mask() { let op = Fake::new(Mask::stars()); let source = TextData::new("hypertension"); - let entity = entity_over(EntityKind::Diagnosis, source.text.as_str()); + let entity = entity_over(builtins::DIAGNOSIS.label_ref(), source.text.as_str()); let out = op.apply(&entity, &source).await.unwrap(); assert_eq!(out, TextReplacement::substituted("************")); } @@ -274,7 +274,7 @@ mod tests { async fn same_seed_and_entity_id_produces_same_output() { let op = fake().with_seed(42); let source = TextData::new("alice"); - let entity = entity_over(EntityKind::PersonName, source.text.as_str()); + let entity = entity_over(builtins::PERSON_NAME.label_ref(), source.text.as_str()); let a = op.apply(&entity, &source).await.unwrap(); let b = op.apply(&entity, &source).await.unwrap(); assert_eq!(a, b); @@ -284,7 +284,7 @@ mod tests { async fn entity_language_overrides_default() { let op = fake(); let source = TextData::new("名前"); - let mut entity = entity_over(EntityKind::PersonName, 
source.text.as_str()); + let mut entity = entity_over(builtins::PERSON_NAME.label_ref(), source.text.as_str()); entity.language = Some("ja".parse().unwrap()); let out = op.apply(&entity, &source).await.unwrap(); let TextReplacement::Substituted { value } = out else { @@ -297,7 +297,7 @@ mod tests { async fn default_language_applies_when_entity_unlanguaged() { let op = fake().with_default_language("ja".parse().unwrap()); let source = TextData::new("name"); - let entity = entity_over(EntityKind::PersonName, source.text.as_str()); + let entity = entity_over(builtins::PERSON_NAME.label_ref(), source.text.as_str()); let out = op.apply(&entity, &source).await.unwrap(); let TextReplacement::Substituted { value } = out else { panic!("expected Substituted variant"); @@ -309,12 +309,24 @@ mod tests { async fn coreferent_entities_collapse_to_same_fake() { let op = fake(); let source = TextData::new("alice"); - let a = coref_entity(EntityKind::PersonName, source.text.as_str(), "ENTITY_42"); - let b = coref_entity(EntityKind::PersonName, source.text.as_str(), "ENTITY_42"); + let a = coref_entity( + builtins::PERSON_NAME.label_ref(), + source.text.as_str(), + "ENTITY_42", + ); + let b = coref_entity( + builtins::PERSON_NAME.label_ref(), + source.text.as_str(), + "ENTITY_42", + ); let out_a = op.apply(&a, &source).await.unwrap(); let out_b = op.apply(&b, &source).await.unwrap(); assert_eq!(out_a, out_b); - let c = coref_entity(EntityKind::PersonName, source.text.as_str(), "ENTITY_99"); + let c = coref_entity( + builtins::PERSON_NAME.label_ref(), + source.text.as_str(), + "ENTITY_99", + ); let out_c = op.apply(&c, &source).await.unwrap(); assert_ne!( out_a, out_c, @@ -328,7 +340,7 @@ mod tests { let source = TextData::new("seed"); let mut outputs: HashSet = HashSet::new(); for _ in 0..32 { - let entity = entity_over(EntityKind::PersonName, source.text.as_str()); + let entity = entity_over(builtins::PERSON_NAME.label_ref(), source.text.as_str()); let out = op.apply(&entity, 
&source).await.unwrap(); let TextReplacement::Substituted { value } = out else { panic!("expected Substituted"); @@ -346,7 +358,7 @@ mod tests { async fn tabular_impl_emits_substituted() { let op = fake(); let entity = Entity::::builder() - .with_entity_kind(EntityKind::PersonName) + .with_label(builtins::PERSON_NAME.label_ref()) .with_location(nvisy_core::modality::TabularLocation { row_index: 0u32, column_index: 0u32, diff --git a/crates/nvisy-fake/src/generator/mod.rs b/crates/nvisy-fake/src/generator/mod.rs index e6453b8f..25948e50 100644 --- a/crates/nvisy-fake/src/generator/mod.rs +++ b/crates/nvisy-fake/src/generator/mod.rs @@ -1,17 +1,17 @@ -//! Per-[`EntityKind`] fake-value generation, dispatched by [`Locale`]. +//! Per-label fake-value generation, dispatched by [`Locale`]. //! -//! [`Context::generate`] returns `Some(string)` for every entity -//! kind the catalogue covers, or `None` for kinds the fake-data -//! layer doesn't support — the caller delegates to its fallback +//! [`Context::generate`] returns `Some(string)` for every label the +//! catalogue covers, or `None` for labels the fake-data layer +//! doesn't support — the caller delegates to its fallback //! anonymizer in that case. //! //! Two paths: //! -//! - **Structured kinds** (IBAN, payment cards, dates, IPs, …) +//! - **Structured labels** (IBAN, payment cards, dates, IPs, …) //! pattern-preserve the original string: same length, same //! character-class layout, randomised digits and letters. //! See [`pattern::pattern_preserve`]. -//! - **Free-form kinds** (names, addresses, organisations, …) +//! - **Free-form labels** (names, addresses, organisations, …) //! emit a fresh locale-aware fake whose length doesn't need to //! match. These go through per-domain submodules. 
@@ -28,77 +28,93 @@ use fake::Fake; use fake::faker::number::raw as number; use fake::locales::EN; use fake::rand::RngExt; -use nvisy_core::entity::EntityKind; use crate::locale::Locale; -/// Per-call options threaded through to each kind generator. +/// Per-call options threaded through to each label generator. pub(crate) struct Context<'a> { locale: Locale, - kind: EntityKind, + label: &'a str, original: &'a str, } impl<'a> Context<'a> { /// Build a generation request. - pub(crate) fn new(locale: Locale, kind: EntityKind, original: &'a str) -> Self { + pub(crate) fn new(locale: Locale, label: &'a str, original: &'a str) -> Self { Self { locale, - kind, + label, original, } } /// Generate a fake replacement string for this context, using /// `rng` as the entropy source. Returns `None` when the entity - /// kind isn't covered. + /// label isn't covered. + /// + /// Two paths: + /// - **Structured** labels reshape the original string in place + /// via [`pattern::pattern_preserve`]; they return `None` when + /// `original` is empty since there's no pattern to copy. + /// - **Free-form** labels emit a fresh locale-aware fake whose + /// length doesn't need to match `original`. pub(crate) fn generate(self, rng: &mut R) -> Option { - // Structured kinds: scramble the original in place. Skip - // when source is empty — there's no pattern to copy from. - if self.kind.is_structured() { - if self.original.is_empty() { - return None; - } - return Some(pattern::pattern_preserve(self.original, rng)); - } - // Free-form kinds: locale-aware generator. 
- self.produce_free_form(rng) - } - - fn produce_free_form(&self, rng: &mut R) -> Option { let l = self.locale; - let value = match self.kind { - // identity - EntityKind::PersonName => identity::person_name(l, rng), - EntityKind::OrganizationName => identity::organization_name(l, rng), - EntityKind::Occupation => identity::occupation(l, rng), - EntityKind::Username => identity::username(l, rng), - EntityKind::Gender => identity::gender(l, rng), - EntityKind::Language => identity::language(rng), - EntityKind::Nationality => identity::nationality(l, rng), - EntityKind::Citizenship => identity::citizenship(l, rng), - - // contact (free-form subset) - EntityKind::Address => contact::street_address(l, rng), - EntityKind::Url => contact::url(l, rng), - - // temporal (only Age is free-form; DateOfBirth/DateTime - // are structured and pattern-preserved above). - EntityKind::Age => temporal::age(rng), + let preserve = |rng: &mut R| { + (!self.original.is_empty()).then(|| pattern::pattern_preserve(self.original, rng)) + }; + let value = match self.label { + // identity (free-form) + "person_name" => identity::person_name(l, rng), + "organization_name" => identity::organization_name(l, rng), + "occupation" => identity::occupation(l, rng), + "username" => identity::username(l, rng), + "gender" => identity::gender(l, rng), + "language" => identity::language(rng), + "nationality" => identity::nationality(l, rng), + "citizenship" => identity::citizenship(l, rng), + + // contact + "address" => contact::street_address(l, rng), + "url" => contact::url(l, rng), + "email_address" | "phone_number" | "postal_code" => return preserve(rng), + + // temporal + "age" => temporal::age(rng), + "date_of_birth" | "date_time" => return preserve(rng), // finance (free-form subset) - EntityKind::Currency => finance::currency_code(l, rng), - EntityKind::Amount => finance::amount(l, rng), - EntityKind::Quantity => finance::quantity(rng), - - // device (free-form subset: random tokens) - 
EntityKind::Password => device::password(l, rng), - EntityKind::ApiKey => device::api_key(rng), - EntityKind::AuthToken => device::auth_token(rng), - EntityKind::DeviceId => device::device_id(rng), - - // case ids - EntityKind::InternalId | EntityKind::CaseNumber => case_id::internal_id(rng), + "currency" => finance::currency_code(l, rng), + "amount" => finance::amount(l, rng), + "quantity" => finance::quantity(rng), + + // finance (structured) + "iban" | "payment_card" | "card_security_code" | "card_expiry" | "bank_account" + | "bank_routing" | "swift_code" | "crypto_address" => return preserve(rng), + + // device (free-form tokens) + "password" => device::password(l, rng), + "api_key" => device::api_key(rng), + "auth_token" => device::auth_token(rng), + "device_id" => device::device_id(rng), + + // device (structured) + "ip_address" | "mac_address" | "coordinates" => return preserve(rng), + + // case ids (free-form) + "internal_id" | "case_number" => case_id::internal_id(rng), + + // ids (structured) + "government_id" + | "tax_id" + | "drivers_license" + | "passport_number" + | "national_insurance_number" + | "vehicle_id" + | "license_plate" + | "medical_id" + | "insurance_id" + | "prescription_id" => return preserve(rng), _ => return None, }; @@ -106,7 +122,7 @@ impl<'a> Context<'a> { } } -/// Shared helper for kinds that synthesise digit groups outside +/// Shared helper for labels that synthesise digit groups outside /// the fake-rs locale tables (bank account, IDs). 
pub(crate) fn digits(len: usize, rng: &mut R) -> String { let fmt = "#".repeat(len); @@ -124,90 +140,48 @@ mod tests { SmallRng::seed_from_u64(7) } - fn ctx<'a>(locale: Locale, kind: EntityKind, original: &'a str) -> Context<'a> { - Context::new(locale, kind, original) + fn ctx<'a>(locale: Locale, label: &'a str, original: &'a str) -> Context<'a> { + Context::new(locale, label, original) } #[test] - fn unsupported_kinds_return_none() { + fn unsupported_labels_return_none() { let mut rng = rng(); - for kind in [ - EntityKind::Fingerprint, - EntityKind::Face, - EntityKind::Religion, - EntityKind::Diagnosis, - ] { + for label in ["fingerprint", "face", "religion", "diagnosis"] { assert!( - ctx(Locale::En, kind, "").generate(&mut rng).is_none(), - "{kind:?} should be None" + ctx(Locale::En, label, "").generate(&mut rng).is_none(), + "{label} should be None" ); } } #[test] - fn structured_kind_with_empty_source_returns_none() { + fn structured_label_with_empty_source_returns_none() { let mut rng = rng(); // No pattern to copy → can't pattern-preserve. 
- assert!( - ctx(Locale::En, EntityKind::Iban, "") - .generate(&mut rng) - .is_none() - ); + assert!(ctx(Locale::En, "iban", "").generate(&mut rng).is_none()); } #[test] - fn structured_kinds_preserve_original_shape() { - let cases: &[(EntityKind, &str)] = &[ - (EntityKind::Iban, "GB82WEST12345698765432"), - (EntityKind::PaymentCard, "4111-1111-1111-1111"), - (EntityKind::PhoneNumber, "+1-555-123-4567"), - (EntityKind::DateOfBirth, "1985-03-12"), - (EntityKind::IpAddress, "192.168.1.1"), - (EntityKind::PostalCode, "SW1A 1AA"), + fn structured_labels_preserve_original_shape() { + let cases: &[(&str, &str)] = &[ + ("iban", "GB82WEST12345698765432"), + ("payment_card", "4111-1111-1111-1111"), + ("phone_number", "+1-555-123-4567"), + ("date_of_birth", "1985-03-12"), + ("ip_address", "192.168.1.1"), + ("postal_code", "SW1A 1AA"), ]; - for &(kind, original) in cases { + for &(label, original) in cases { let mut rng = rng(); - let out = ctx(Locale::En, kind, original).generate(&mut rng).unwrap(); - assert_eq!(out.len(), original.len(), "{kind:?}: length mismatch"); + let out = ctx(Locale::En, label, original).generate(&mut rng).unwrap(); + assert_eq!(out.len(), original.len(), "{label}: length mismatch"); // Separator positions match. 
for (i, (a, b)) in out.chars().zip(original.chars()).enumerate() { if !a.is_ascii_alphanumeric() { - assert_eq!(a, b, "{kind:?}: separator mismatch at {i} ({a:?} vs {b:?})"); + assert_eq!(a, b, "{label}: separator mismatch at {i} ({a:?} vs {b:?})"); } } } } - - #[test] - fn free_form_kinds_return_non_empty() { - let kinds = [ - EntityKind::PersonName, - EntityKind::OrganizationName, - EntityKind::Occupation, - EntityKind::Username, - EntityKind::Gender, - EntityKind::Language, - EntityKind::Nationality, - EntityKind::Citizenship, - EntityKind::Address, - EntityKind::Url, - EntityKind::Age, - EntityKind::Currency, - EntityKind::Amount, - EntityKind::Quantity, - EntityKind::Password, - EntityKind::ApiKey, - EntityKind::AuthToken, - EntityKind::DeviceId, - EntityKind::InternalId, - EntityKind::CaseNumber, - ]; - for kind in kinds { - let mut rng = rng(); - let out = ctx(Locale::DeDe, kind, "") - .generate(&mut rng) - .unwrap_or_else(|| panic!("no value for {kind:?}")); - assert!(!out.is_empty(), "empty for {kind:?}"); - } - } } diff --git a/crates/nvisy-llm/src/recognition/candidates.rs b/crates/nvisy-llm/src/recognition/candidates.rs index 1ec5ef3f..195c7404 100644 --- a/crates/nvisy-llm/src/recognition/candidates.rs +++ b/crates/nvisy-llm/src/recognition/candidates.rs @@ -1,7 +1,6 @@ //! Structured-output candidate types — the typed schemas the model //! is asked to produce. -use nvisy_core::entity::EntityKind; use nvisy_core::primitive::NormalizedBoundingBox; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -26,9 +25,9 @@ pub(super) struct TextCandidate { /// entity. Stable across coreferent mentions within one call. #[serde(default)] pub entity_id: Option, - /// Specific entity type. Missing (`None`) means the model - /// declined to type the candidate; the recognizer drops these. - pub entity_type: Option, + /// Label name. Missing (`None`) means the model declined to + /// type the candidate; the recognizer drops these. 
+ pub entity_type: Option, /// The matched text value — the literal surface form to flag. pub value: String, /// Model-asserted confidence in `[0.0, 1.0]`. @@ -58,7 +57,7 @@ pub(super) struct VlmCandidates { /// coordinates using the source image's dimensions. #[derive(Debug, Clone, PartialEq, Serialize, Deserialize, JsonSchema)] pub(super) struct VlmCandidate { - pub entity_kind: EntityKind, + pub label: String, #[serde(flatten)] pub bbox: NormalizedBoundingBox, #[serde(default)] diff --git a/crates/nvisy-llm/src/recognition/file_prompt.rs b/crates/nvisy-llm/src/recognition/file_prompt.rs index 7e65bc5f..510694da 100644 --- a/crates/nvisy-llm/src/recognition/file_prompt.rs +++ b/crates/nvisy-llm/src/recognition/file_prompt.rs @@ -14,8 +14,8 @@ //! name = "ner-default" //! modality = "text" # or "image" //! -//! # Optional. Maps model-emitted labels to canonical EntityKind. -//! # Use snake_case EntityKind names on the right-hand side. +//! # Optional. Maps model-emitted labels to canonical entity +//! # labels. Use snake_case label names on the right-hand side. //! [label_map] //! person = "person_name" //! 
email = "email_address" @@ -48,7 +48,7 @@ use std::path::Path; use base64::Engine; use base64::engine::general_purpose::STANDARD; use minijinja::{Environment, context}; -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::Entity; use nvisy_core::modality::{Image, Text}; use nvisy_core::recognition::{LabelMap, RecognizerInput}; use nvisy_core::{Error, Result}; @@ -113,14 +113,11 @@ impl FilePrompt { let mut label_map = LabelMap::new(); if let Some(entries) = parsed.label_map { - for (label, kind_str) in entries { - let kind = kind_str.parse::().map_err(|_| { - Error::validation( - format!("unknown EntityKind {kind_str:?} in label_map"), - "file-prompt", - ) - })?; - label_map = label_map.with_entry(label, kind); + for (model_label, entity_label) in entries { + label_map = label_map.with_entry( + model_label, + nvisy_core::entity::EntityLabelRef::from(entity_label), + ); } } @@ -158,8 +155,8 @@ impl FilePrompt { /// # Errors /// /// Returns a validation error when the file is missing, malformed, - /// declares a non-`text` modality, references an unknown - /// `EntityKind`, or contains an invalid Jinja2 template. + /// declares a non-`text` modality, or contains an invalid Jinja2 + /// template. pub fn from_toml_file(path: impl AsRef) -> Result { let raw = fs::read_to_string(path.as_ref()) .map_err(|e| Error::validation(format!("reading prompt file: {e}"), "file-prompt"))?; @@ -215,7 +212,7 @@ impl Prompt for FilePrompt { let snippet = snippet_around(text, h.location.start, h.location.end); context! { name => h.name.as_deref().unwrap_or(""), - kind => h.entity_kind.map(|k| k.to_string()).unwrap_or_else(|| "unknown".to_owned()), + kind => h.label.as_ref().map(|l| l.to_string()).unwrap_or_else(|| "unknown".to_owned()), value => value, snippet => snippet, } @@ -259,7 +256,7 @@ impl Prompt for FilePrompt { let bbox = &h.location.bounding_box; context! 
{ name => h.name.as_deref().unwrap_or(""), - kind => h.entity_kind.map(|k| k.to_string()).unwrap_or_else(|| "unknown".to_owned()), + kind => h.label.as_ref().map(|l| l.to_string()).unwrap_or_else(|| "unknown".to_owned()), bbox => context! { x => bbox.x, y => bbox.y, diff --git a/crates/nvisy-llm/src/recognition/lift.rs b/crates/nvisy-llm/src/recognition/lift.rs index ba4ccef4..10f39e56 100644 --- a/crates/nvisy-llm/src/recognition/lift.rs +++ b/crates/nvisy-llm/src/recognition/lift.rs @@ -4,14 +4,14 @@ //! Both prompts produce the same `{TextCandidates,VlmCandidates}` //! JSON shape and walk it the same way to emit entities. The only //! axis they differ on is whether they apply a [`LabelMap`] + -//! `labels_to_ignore` filter on the model's emitted kind label — +//! `labels_to_ignore` filter on the model's emitted label string — //! [`DefaultPrompt`] passes an empty map + empty slice (no //! filtering); [`FilePrompt`] passes its loaded config. //! //! [`DefaultPrompt`]: super::default_prompt::DefaultPrompt //! [`FilePrompt`]: super::file_prompt::FilePrompt -use nvisy_core::entity::{Entity, EntityKind, ModelProvenance, TrailProvenance, TrailStep}; +use nvisy_core::entity::{Entity, EntityLabelRef, ModelProvenance, TrailProvenance, TrailStep}; use nvisy_core::modality::{Image, ImageLocation, Text, TextLocation}; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::{LabelMap, RecognizerInput}; @@ -26,7 +26,7 @@ const DEFAULT_CONFIDENCE: f64 = 0.5; /// Lift a parsed text-candidate batch into `Entity` values. /// /// `label_map` and `labels_to_ignore` together implement the -/// model-label → canonical-kind translation. Pass an empty map + an +/// model-label → canonical-name translation. Pass an empty map + an /// empty slice for no filtering. 
pub(super) fn lift_text( input: &RecognizerInput, @@ -40,8 +40,8 @@ pub(super) fn lift_text( let mut out = Vec::with_capacity(localized.len()); for l in localized { - let Some(entity_kind) = resolve_text_kind( - l.candidate.entity_type, + let Some(label) = resolve_text_label( + l.candidate.entity_type.as_deref(), l.candidate.value.as_str(), label_map, labels_to_ignore, @@ -53,7 +53,7 @@ pub(super) fn lift_text( continue; }; let location = TextLocation::new(l.start_offset, l.end_offset); - let reason = format!("llm identified {entity_kind}"); + let reason = format!("llm identified {label}"); let step = TrailStep::recognition( "llm-ner", confidence, @@ -62,7 +62,7 @@ pub(super) fn lift_text( ); let mut b = Entity::builder() - .with_entity_kind(entity_kind) + .with_label(label) .with_trail(vec![step]) .with_confidence(confidence) .with_location(location); @@ -86,18 +86,20 @@ pub(super) fn lift_image( let mut out = Vec::with_capacity(candidates.len()); for d in candidates { - let kind_str = d.entity_kind.to_string(); - if labels_to_ignore.iter().any(|l| l == &kind_str) { + if labels_to_ignore.iter().any(|l| l == &d.label) { continue; } - let entity_kind = label_map.lookup(&kind_str).unwrap_or(d.entity_kind); + let label = label_map + .lookup(&d.label) + .cloned() + .unwrap_or_else(|| EntityLabelRef::from(d.label.clone())); let raw = d.confidence.unwrap_or(DEFAULT_CONFIDENCE); let Some(confidence) = Confidence::new(raw.clamp(0.0, 1.0)) else { continue; }; let bbox = d.bbox.to_pixel(dims); let location = ImageLocation::new(bbox); - let reason = format!("vlm identified {entity_kind}"); + let reason = format!("vlm identified {label}"); let step = TrailStep::recognition( "llm-vlm", confidence, @@ -105,7 +107,7 @@ pub(super) fn lift_image( reason, ); let entity = Entity::builder() - .with_entity_kind(entity_kind) + .with_label(label) .with_trail(vec![step]) .with_confidence(confidence) .with_location(location) @@ -116,26 +118,30 @@ pub(super) fn lift_image( out } -/// Pick 
the canonical [`EntityKind`] for a text candidate. +/// Pick the canonical label name for a text candidate. /// -/// Priority order: (1) the model's typed kind, after label-map + -/// ignore-list filtering; (2) literal-value lookup in the label map -/// (covers raw-string-label backends); (3) drop. -fn resolve_text_kind( - typed: Option, +/// Priority order: (1) the model's typed label, after label-map + +/// ignore-list filtering; (2) literal-value lookup in the label +/// map (covers raw-string-label backends); (3) drop. +fn resolve_text_label( + typed: Option<&str>, value: &str, label_map: &LabelMap, labels_to_ignore: &[String], -) -> Option { - if let Some(kind) = typed { - let s = kind.to_string(); - if labels_to_ignore.iter().any(|l| l == &s) { +) -> Option { + if let Some(model_label) = typed { + if labels_to_ignore.iter().any(|l| l == model_label) { return None; } - return Some(label_map.lookup(&s).unwrap_or(kind)); + return Some( + label_map + .lookup(model_label) + .cloned() + .unwrap_or_else(|| EntityLabelRef::from(model_label.to_owned())), + ); } if labels_to_ignore.iter().any(|l| l == value) { return None; } - label_map.lookup(value) + label_map.lookup(value).cloned() } diff --git a/crates/nvisy-llm/src/recognition/text_prompt.rs b/crates/nvisy-llm/src/recognition/text_prompt.rs index 43bc3ac3..591fac41 100644 --- a/crates/nvisy-llm/src/recognition/text_prompt.rs +++ b/crates/nvisy-llm/src/recognition/text_prompt.rs @@ -57,8 +57,9 @@ impl<'a> TextPromptBuilder<'a> { let snippet = snippet_around(self.text, h.location.start, h.location.end); let name = h.name.as_deref().unwrap_or(""); let kind = h - .entity_kind - .map(|k| k.to_string()) + .label + .as_ref() + .map(|l| l.to_string()) .unwrap_or_else(|| "unknown".to_string()); prompt.push_str(&format!( "\n[hint {i}] name=\"{name}\", kind={kind}, \ diff --git a/crates/nvisy-llm/src/recognition/vlm_prompt.rs b/crates/nvisy-llm/src/recognition/vlm_prompt.rs index 453150c8..7e562bdc 100644 --- 
a/crates/nvisy-llm/src/recognition/vlm_prompt.rs +++ b/crates/nvisy-llm/src/recognition/vlm_prompt.rs @@ -42,8 +42,9 @@ impl<'a> VlmPromptBuilder<'a> { for (i, h) in self.hints.iter().enumerate() { let bbox = &h.location.bounding_box; let kind = h - .entity_kind - .map(|k| k.to_string()) + .label + .as_ref() + .map(|l| l.to_string()) .unwrap_or_else(|| "unknown".to_string()); let name = h.name.as_deref().unwrap_or(""); prompt.push_str(&format!( diff --git a/crates/nvisy-llm/tests/file_prompt.rs b/crates/nvisy-llm/tests/file_prompt.rs index acf8cdad..2d5a99cb 100644 --- a/crates/nvisy-llm/tests/file_prompt.rs +++ b/crates/nvisy-llm/tests/file_prompt.rs @@ -9,7 +9,7 @@ //! rendering for text, bbox access for image), plus the //! `label_map` / `labels_to_ignore` policy on the lift side. -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::{EntityLabelRef, builtins}; use nvisy_core::modality::{ImageData, ImageLocation, TextData, TextLocation}; use nvisy_core::primitive::{BoundingBox, Dimensions}; use nvisy_core::recognition::{Hint, RecognizerInput}; @@ -30,7 +30,7 @@ fn text_prompt_renders_template_and_lifts_entities() { let hint = Hint::::new(TextLocation::new(alice_start, alice_end)) .with_name("uploader-alice") - .with_entity_kind(EntityKind::PersonName); + .with_label(builtins::PERSON_NAME.label_ref()); let input = RecognizerInput::new(TextData::new(body)) .with_hints(vec![hint]) @@ -58,10 +58,10 @@ fn text_prompt_renders_template_and_lifts_entities() { ); // -- lift(): the TOML maps `person_name → date_of_birth` and - // ignores `diagnosis`. The model emits typed snake_case kinds - // (TextCandidate.entity_type is `Option`); we expect - // PersonName → DateOfBirth via the map, EmailAddress untouched, - // and Diagnosis dropped by the ignore list. + // ignores `diagnosis`. 
The model emits snake_case label names + // (TextCandidate.entity_type is `Option`); we expect + // person_name → date_of_birth via the map, email_address + // untouched, and diagnosis dropped by the ignore list. let response = LlmResponse::new( r#"{"entities":[ {"entity_type":"person_name","value":"Alice Carter","context":"From: Alice Carter <","confidence":0.9}, @@ -71,17 +71,17 @@ fn text_prompt_renders_template_and_lifts_entities() { ); let entities = prompt.lift(&response, &input); - let kinds: Vec = entities.iter().map(|e| e.entity_kind).collect(); + let kinds: Vec = entities.iter().map(|e| e.label.clone()).collect(); assert!( - kinds.contains(&EntityKind::DateOfBirth), + kinds.contains(&builtins::DATE_OF_BIRTH.label_ref()), "person_name should have been remapped to DateOfBirth via label_map: {kinds:?}", ); assert!( - kinds.contains(&EntityKind::EmailAddress), + kinds.contains(&builtins::EMAIL_ADDRESS.label_ref()), "email_address (no map entry) should pass through: {kinds:?}", ); assert!( - !kinds.contains(&EntityKind::Diagnosis), + !kinds.contains(&builtins::DIAGNOSIS.label_ref()), "diagnosis was in labels_to_ignore but appeared: {kinds:?}", ); assert_eq!( @@ -105,7 +105,7 @@ fn image_prompt_renders_template_and_lifts_entities() { 10.0, 20.0, 100.0, 50.0, ))) .with_name("uploader-face") - .with_entity_kind(EntityKind::PersonName); + .with_label(builtins::PERSON_NAME.label_ref()); let input = RecognizerInput::new(ImageData::new(bytes.clone(), dims)) .with_hints(vec![hint]) @@ -132,24 +132,24 @@ fn image_prompt_renders_template_and_lifts_entities() { // kinds; assert remap + ignore both fire. 
let response = LlmResponse::new( r#"{"entities":[ - {"entity_kind":"person_name","x":0.1,"y":0.1,"width":0.2,"height":0.2,"confidence":0.85}, - {"entity_kind":"license_plate","x":0.5,"y":0.5,"width":0.1,"height":0.05,"confidence":0.7}, - {"entity_kind":"url","x":0.0,"y":0.0,"width":0.05,"height":0.05,"confidence":0.9} + {"label":"person_name","x":0.1,"y":0.1,"width":0.2,"height":0.2,"confidence":0.85}, + {"label":"license_plate","x":0.5,"y":0.5,"width":0.1,"height":0.05,"confidence":0.7}, + {"label":"url","x":0.0,"y":0.0,"width":0.05,"height":0.05,"confidence":0.9} ]}"#, ); let entities = prompt.lift(&response, &input); - let kinds: Vec = entities.iter().map(|e| e.entity_kind).collect(); + let kinds: Vec = entities.iter().map(|e| e.label.clone()).collect(); assert!( - kinds.contains(&EntityKind::DateOfBirth), + kinds.contains(&builtins::DATE_OF_BIRTH.label_ref()), "person_name should have been remapped to DateOfBirth via label_map: {kinds:?}", ); assert!( - kinds.contains(&EntityKind::LicensePlate), + kinds.contains(&builtins::LICENSE_PLATE.label_ref()), "license_plate (no map entry) should pass through: {kinds:?}", ); assert!( - !kinds.contains(&EntityKind::Url), + !kinds.contains(&builtins::URL.label_ref()), "url was in labels_to_ignore but appeared: {kinds:?}", ); assert_eq!( diff --git a/crates/nvisy-ner/src/backend/bento_backend.rs b/crates/nvisy-ner/src/backend/bento_backend.rs index 3acf7b86..e84014af 100644 --- a/crates/nvisy-ner/src/backend/bento_backend.rs +++ b/crates/nvisy-ner/src/backend/bento_backend.rs @@ -5,20 +5,18 @@ //! `nvisy_core.ner.v1` from [`nvisycom/inference`]; per-request //! `correlation_id` propagation rides on the `x-request-id` //! header. Today the service is zero-shot — it requires a per-call -//! `kinds` list — so the backend errors out when called with -//! `kinds = None`. +//! `labels` list — so the backend errors out when called with +//! `labels = None`. //! -//! Wire compatibility note: the inference service today returns -//! 
`kind: EntityKind` (already normalized server-side); this -//! backend serialises that back to a string label so the -//! recognizer's [`LabelMap`] sees a uniform raw label regardless -//! of which backend produced it. +//! Wire compatibility note: the inference service returns the +//! service-side label string verbatim; the recognizer's +//! [`LabelMap`] re-canonicalises it into a workspace label name. //! //! [`nvisycom/inference`]: https://github.com/nvisycom/inference -//! [`LabelMap`]: crate::LabelMap +//! [`LabelMap`]: nvisy_core::recognition::LabelMap use bentoml::prelude::*; -use nvisy_core::entity::{EntityKind, ModelProvenance}; +use nvisy_core::entity::ModelProvenance; use nvisy_core::{Error, Result}; use uuid::Uuid; @@ -78,20 +76,20 @@ impl NerBackend for BentoBackend { #[tracing::instrument(skip_all)] async fn recognize(&self, request: NerRequest<'_>) -> Result { - let Some(kinds) = request.kinds else { + let Some(labels) = request.labels else { return Err(Error::validation( - "BentoBackend requires per-call kinds (the inference-gliner service is zero-shot)", + "BentoBackend requires per-call labels (the inference-gliner service is zero-shot)", COMPONENT, )); }; - if kinds.is_empty() { + if labels.is_empty() { return Ok(NerResponse::default()); } let language = request.language.map(|l| l.as_str().to_owned()); let wire_request = WireRequest { text: request.text.to_owned(), - kinds: kinds.to_vec(), + labels: labels.iter().map(|s| (*s).to_owned()).collect(), threshold: 0.0, language, }; @@ -114,7 +112,7 @@ impl NerBackend for BentoBackend { for response in &responses { for entity in &response.entities { spans.push(RawNerSpan::new( - entity_kind_label(entity.kind), + entity.label.clone(), entity.score, entity.start..entity.end, )); @@ -123,14 +121,3 @@ impl NerBackend for BentoBackend { Ok(NerResponse::new(spans)) } } - -/// Serialise an [`EntityKind`] to its -/// canonical snake_case label. 
The Bento service returns normalized -/// kinds; we round-trip through the wire label so [`LabelMap`] sees -/// a uniform raw label across backends. -/// -/// [`EntityKind`]: nvisy_core::entity::EntityKind -/// [`LabelMap`]: crate::LabelMap -fn entity_kind_label(kind: EntityKind) -> String { - kind.to_string() -} diff --git a/crates/nvisy-ner/src/backend/bento_types.rs b/crates/nvisy-ner/src/backend/bento_types.rs index 1d507201..0aa1a2ed 100644 --- a/crates/nvisy-ner/src/backend/bento_types.rs +++ b/crates/nvisy-ner/src/backend/bento_types.rs @@ -7,7 +7,6 @@ //! [`BentoBackend`]: super::BentoBackend //! [`nvisycom/inference`]: https://github.com/nvisycom/inference -use nvisy_core::entity::EntityKind; use serde::{Deserialize, Serialize}; /// Outer batch wrapper. Single- and multi-text calls share the @@ -26,10 +25,10 @@ pub(super) struct WireBatch { pub(super) struct WireRequest { /// The text to recognise entities in. pub text: String, - /// Entity kinds the caller is interested in. GLiNER is + /// Entity label names the caller is interested in. GLiNER is /// zero-shot — sending an empty list is meaningless and the /// runtime short-circuits before making the call. - pub kinds: Vec, + pub labels: Vec, /// Lower bound on per-entity score. The runtime keeps this at /// `0.0` and post-filters locally so threshold decisions stay /// in one place (the engine-side detection driver). @@ -51,9 +50,11 @@ pub(super) struct WireResponse { /// batch. #[allow(dead_code)] pub model: String, - /// Recognised entities, already classified into the canonical - /// [`EntityKind`] taxonomy by the service. Defaults to empty - /// when the service omits the field. + /// Recognised entities. Each entity's label is the + /// service-side classification, returned as a string so the + /// recognizer's [`LabelMap`] can re-canonicalise it. 
+ /// + /// [`LabelMap`]: nvisy_core::recognition::LabelMap #[serde(default)] pub entities: Vec, } @@ -62,8 +63,10 @@ pub(super) struct WireResponse { #[derive(Deserialize)] #[serde(rename_all = "camelCase")] pub(super) struct WireEntity { - /// Canonical entity kind the service classified this span as. - pub kind: EntityKind, + /// Service-side classification of this span. Translated to + /// the workspace label vocabulary by the recognizer's + /// `LabelMap`. + pub label: String, /// Raw model score in `[0.0, 1.0]`. pub score: f64, /// Byte offset of the entity's start within diff --git a/crates/nvisy-ner/src/backend/mod.rs b/crates/nvisy-ner/src/backend/mod.rs index 155c04c6..bc27c31b 100644 --- a/crates/nvisy-ner/src/backend/mod.rs +++ b/crates/nvisy-ner/src/backend/mod.rs @@ -1,12 +1,11 @@ //! Backend layer: the [`NerBackend`] trait and its shipped impls. //! -//! One trait covers zero-shot backends (per-call kinds via -//! [`NerRequest::kinds = Some(...)`]) and fixed-label backends (kinds -//! baked into the model, `kinds = None`). Built-in [`NoopBackend`] -//! (returns no spans; test stub) and feature-gated [`BentoBackend`] -//! (HTTP call into the externalised `inference-gliner` service). -//! -//! [`NerRequest::kinds = Some(...)`]: NerRequest::kinds +//! One trait covers zero-shot backends (per-call labels via +//! [`NerRequest::labels = Some(...)`][NerRequest::labels]) and +//! fixed-label backends (labels baked into the model, +//! `labels = None`). Built-in [`NoopBackend`] (returns no spans; +//! test stub) and feature-gated [`BentoBackend`] (HTTP call into the +//! externalised `inference-gliner` service). mod ner_backend; mod ner_span; diff --git a/crates/nvisy-ner/src/backend/ner_backend.rs b/crates/nvisy-ner/src/backend/ner_backend.rs index 4c66ee2f..1a528921 100644 --- a/crates/nvisy-ner/src/backend/ner_backend.rs +++ b/crates/nvisy-ner/src/backend/ner_backend.rs @@ -2,11 +2,11 @@ //! //! 
Replaces the previous split between `GlinerBackend` (zero-shot, //! takes per-call kinds) and `NlpEngine`-produced NER spans -//! (fixed-label, no per-call kinds). The `kinds` field on -//! [`NerRequest`] is `Option<&[EntityKind]>`: `Some(...)` for -//! zero-shot backends that take a kind allowlist per call, -//! `None` for fixed-label backends whose set of kinds is baked -//! into the model. +//! (fixed-label, no per-call kinds). The `labels` field on +//! [`NerRequest`] is `Option<&[&str]>`: `Some(...)` for zero-shot +//! backends that take a label allowlist per call, `None` for +//! fixed-label backends whose set of labels is baked into the +//! model. //! //! Engines are called from inside [`NerRecognizer::recognize`] — no //! shared NLP pass, no orchestrator plumbing. Each recognizer holds @@ -15,7 +15,7 @@ //! [`NerRecognizer::recognize`]: crate::NerRecognizer use nvisy_core::Result; -use nvisy_core::entity::{EntityKind, ModelProvenance}; +use nvisy_core::entity::ModelProvenance; use nvisy_core::primitive::LanguageTag; use uuid::Uuid; @@ -27,12 +27,12 @@ pub struct NerRequest<'a> { /// Source text to scan. Byte offsets in returned spans refer /// back into this string. pub text: &'a str, - /// Kinds to detect when the backend supports per-call kind - /// selection. `None` means the backend uses its built-in + /// Label names to detect when the backend supports per-call + /// label selection. `None` means the backend uses its built-in /// fixed label set; `Some(slice)` means restrict detection to - /// the listed kinds. Empty slice short-circuits the call to - /// no work in the caller. - pub kinds: Option<&'a [EntityKind]>, + /// the listed names. Empty slice short-circuits the call to no + /// work in the caller. + pub labels: Option<&'a [&'a str]>, /// Caller-asserted language. Backends that support per-call /// language hinting use this; backends that don't ignore it. 
pub language: Option<&'a LanguageTag>, diff --git a/crates/nvisy-ner/src/backend/ner_span.rs b/crates/nvisy-ner/src/backend/ner_span.rs index 6c6a7d32..5eb6a0a7 100644 --- a/crates/nvisy-ner/src/backend/ner_span.rs +++ b/crates/nvisy-ner/src/backend/ner_span.rs @@ -18,8 +18,9 @@ use std::ops::Range; /// One raw entity span predicted by a NER model. /// /// Pre-normalization: the label is the model's string, not a -/// translated `EntityKind`. Coordinate space is byte offsets into the -/// source text the backend was called with. +/// canonical [`EntityLabelRef`][nvisy_core::entity::EntityLabelRef]. +/// Coordinate space is byte offsets into the source text the backend +/// was called with. #[derive(Debug, Clone, PartialEq)] pub struct RawNerSpan { /// Model-emitted label, verbatim. diff --git a/crates/nvisy-ner/src/recognition/config.rs b/crates/nvisy-ner/src/recognition/config.rs index 8feecda4..f96b5888 100644 --- a/crates/nvisy-ner/src/recognition/config.rs +++ b/crates/nvisy-ner/src/recognition/config.rs @@ -15,7 +15,6 @@ use std::collections::HashSet; use derive_builder::Builder; -use nvisy_core::entity::EntityKind; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::LabelMap; @@ -30,9 +29,8 @@ use super::aggregation::{AggregationStrategy, AlignmentMode}; build_fn(skip) )] pub struct NerModel { - /// Translation from raw model labels to canonical - /// [`EntityKind`] values. Defaults to - /// [`LabelMap::canonical`]. + /// Translation from raw model labels to canonical entity + /// label names. Defaults to [`LabelMap::canonical`]. pub label_map: LabelMap, /// Raw labels the adapter drops without translation. Useful /// for filtering out labels the model emits but we don't care @@ -43,11 +41,11 @@ pub struct NerModel { /// `[0.0, 1.0]` (treated as a bug; clamped + this used as the /// safe default). pub default_score: Confidence, - /// Entity kinds whose emitted confidence is multiplied by - /// `low_score_multiplier` before being surfaced. 
Use for + /// Entity label names whose emitted confidence is multiplied + /// by `low_score_multiplier` before being surfaced. Use for /// noisy-but-high-recall labels. - pub low_score_kinds: HashSet, - /// Multiplier applied to `low_score_kinds`. Must be in + pub low_score_labels: HashSet, + /// Multiplier applied to `low_score_labels`. Must be in /// `[0.0, 1.0]`. pub low_score_multiplier: f64, /// Aggregation policy for backends that emit token-level @@ -75,7 +73,7 @@ impl Default for NerModel { label_map: LabelMap::canonical(), labels_to_ignore: HashSet::new(), default_score: Confidence::new(0.85).expect("0.85 in range"), - low_score_kinds: HashSet::new(), + low_score_labels: HashSet::new(), low_score_multiplier: 0.4, aggregation: AggregationStrategy::Max, alignment: AlignmentMode::Expand, @@ -104,7 +102,7 @@ impl NerModelBuilder { label_map: self.label_map.unwrap_or(defaults.label_map), labels_to_ignore: self.labels_to_ignore.unwrap_or(defaults.labels_to_ignore), default_score: self.default_score.unwrap_or(defaults.default_score), - low_score_kinds: self.low_score_kinds.unwrap_or(defaults.low_score_kinds), + low_score_labels: self.low_score_labels.unwrap_or(defaults.low_score_labels), low_score_multiplier: self .low_score_multiplier .unwrap_or(defaults.low_score_multiplier), diff --git a/crates/nvisy-ner/src/recognition/recognizer.rs b/crates/nvisy-ner/src/recognition/recognizer.rs index ee7c905f..210b464b 100644 --- a/crates/nvisy-ner/src/recognition/recognizer.rs +++ b/crates/nvisy-ner/src/recognition/recognizer.rs @@ -18,7 +18,7 @@ use std::sync::Arc; use derive_builder::Builder; -use nvisy_core::entity::{Entity, EntityKind, ModelProvenance, TrailProvenance, TrailStep}; +use nvisy_core::entity::{Entity, EntityLabelRef, ModelProvenance, TrailProvenance, TrailStep}; use nvisy_core::modality::{Text, TextLocation}; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput}; @@ -50,12 +50,12 @@ pub 
struct NerRecognizer { /// [`with_engine`]: NerRecognizerBuilder::with_engine #[builder(setter(custom))] engine: Arc, - /// Kinds the recognizer advertises. When non-empty, the + /// Labels the recognizer advertises. When non-empty, the /// recognizer asks the backend for only this subset on every /// call (zero-shot path). When empty, the backend is asked for /// whatever it natively produces (fixed-label path). #[builder(default)] - supported_kinds: Vec, + supported_labels: Vec, /// Normalization knobs applied to the backend's raw output /// before entities are emitted. #[builder(default)] @@ -79,10 +79,10 @@ impl NerRecognizer { &self.name } - /// Kinds this recognizer advertises. + /// Labels this recognizer advertises. #[must_use] - pub fn supported_kinds(&self) -> &[EntityKind] { - &self.supported_kinds + pub fn supported_labels(&self) -> &[EntityLabelRef] { + &self.supported_labels } /// Borrow the normalization config. @@ -91,20 +91,20 @@ impl NerRecognizer { &self.model } - fn build_entity(&self, span: &RawNerSpan, kind: EntityKind) -> Entity { + fn build_entity(&self, span: &RawNerSpan, label: EntityLabelRef) -> Entity { let raw_confidence = Confidence::try_clamped(span.score).unwrap_or(self.model.default_score); - let confidence = if self.model.low_score_kinds.contains(&kind) { + let confidence = if self.model.low_score_labels.contains(label.as_str()) { let demoted = raw_confidence.get() * self.model.low_score_multiplier; Confidence::try_clamped(demoted).unwrap_or(self.model.default_score) } else { raw_confidence }; let provenance = TrailProvenance::Model(ModelProvenance::new(self.name.clone())); - let reason = format!("recognizer `{}` identified {kind}", self.name); + let reason = format!("recognizer `{}` identified {label}", self.name); let step = TrailStep::recognition("ner", confidence, provenance, reason); Entity::builder() - .with_entity_kind(kind) + .with_label(label) .with_trail(vec![step]) .with_confidence(confidence) 
.with_location(TextLocation::new(span.offset.start, span.offset.end)) @@ -132,14 +132,19 @@ impl NerRecognizerBuilder { #[async_trait::async_trait] impl EntityRecognizer for NerRecognizer { async fn recognize(&self, input: &RecognizerInput) -> Result> { - let kinds = if self.supported_kinds.is_empty() { + let supported_borrowed: Vec<&str> = self + .supported_labels + .iter() + .map(EntityLabelRef::as_str) + .collect(); + let labels = if supported_borrowed.is_empty() { None } else { - Some(self.supported_kinds.as_slice()) + Some(supported_borrowed.as_slice()) }; let request = NerRequest { text: input.data.text.as_str(), - kinds, + labels, language: input.language.as_ref(), correlation_id: input.correlation_id, }; @@ -153,8 +158,12 @@ impl EntityRecognizer for NerRecognizer { self.model .label_map .lookup(&s.label) - .filter(|k| self.supported_kinds.is_empty() || self.supported_kinds.contains(k)) - .map(|k| self.build_entity(s, k)) + .filter(|name| { + self.supported_labels.is_empty() + || self.supported_labels.iter().any(|sl| sl == *name) + }) + .cloned() + .map(|name| self.build_entity(s, name)) }) .collect(); Ok(RecognizerOutput::new(entities)) @@ -163,6 +172,7 @@ impl EntityRecognizer for NerRecognizer { #[cfg(test)] mod tests { + use nvisy_core::entity::builtins; use nvisy_core::modality::TextData; use super::*; @@ -173,7 +183,10 @@ mod tests { let rec = NerRecognizer::builder() .with_name("test") .with_engine(NoopBackend) - .with_supported_kinds(vec![EntityKind::PersonName, EntityKind::EmailAddress]) + .with_supported_labels(vec![ + EntityLabelRef::from(builtins::PERSON_NAME.name.clone()), + EntityLabelRef::from(builtins::EMAIL_ADDRESS.name.clone()), + ]) .build() .expect("builder succeeds"); let input = RecognizerInput::new(TextData::new("Alice Smith")); @@ -182,7 +195,7 @@ mod tests { } #[tokio::test] - async fn empty_supported_kinds_passes_none_to_engine() { + async fn empty_supported_labels_passes_none_to_engine() { let rec = NerRecognizer::builder() 
.with_name("test") .with_engine(NoopBackend) diff --git a/crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.toml b/crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.toml index ab9e76e7..2d73da4f 100644 --- a/crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.toml +++ b/crates/nvisy-pattern/assets/dictionaries/finance/cryptocurrencies.toml @@ -1,3 +1,3 @@ name = "cryptocurrencies" -entity_kind = "currency" +label = "currency" score = 0.85 diff --git a/crates/nvisy-pattern/assets/dictionaries/finance/currencies.toml b/crates/nvisy-pattern/assets/dictionaries/finance/currencies.toml index 4d6f8a3a..4b25dda3 100644 --- a/crates/nvisy-pattern/assets/dictionaries/finance/currencies.toml +++ b/crates/nvisy-pattern/assets/dictionaries/finance/currencies.toml @@ -1,3 +1,3 @@ name = "currencies" -entity_kind = "currency" +label = "currency" score = 0.85 diff --git a/crates/nvisy-pattern/assets/dictionaries/general/languages.toml b/crates/nvisy-pattern/assets/dictionaries/general/languages.toml index 426ac586..d356fef1 100644 --- a/crates/nvisy-pattern/assets/dictionaries/general/languages.toml +++ b/crates/nvisy-pattern/assets/dictionaries/general/languages.toml @@ -1,5 +1,5 @@ name = "languages" -entity_kind = "language" +label = "language" score = 0.85 # Per-CSV-column overrides: # column 0 = long-form names (`English`, `Spanish`, ...) 
— high diff --git a/crates/nvisy-pattern/assets/dictionaries/general/nationalities.toml b/crates/nvisy-pattern/assets/dictionaries/general/nationalities.toml index bb2c7730..001301aa 100644 --- a/crates/nvisy-pattern/assets/dictionaries/general/nationalities.toml +++ b/crates/nvisy-pattern/assets/dictionaries/general/nationalities.toml @@ -1,3 +1,3 @@ name = "nationalities" -entity_kind = "nationality" +label = "nationality" score = 0.85 diff --git a/crates/nvisy-pattern/assets/dictionaries/general/religions.toml b/crates/nvisy-pattern/assets/dictionaries/general/religions.toml index 57100e66..eaf55b82 100644 --- a/crates/nvisy-pattern/assets/dictionaries/general/religions.toml +++ b/crates/nvisy-pattern/assets/dictionaries/general/religions.toml @@ -1,3 +1,3 @@ name = "religions" -entity_kind = "religion" +label = "religion" score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/contact/email.toml b/crates/nvisy-pattern/assets/patterns/contact/email.toml index 30c4b25f..13e70bcf 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/email.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/email.toml @@ -1,4 +1,4 @@ name = "email" -entity_kind = "email_address" +label = "email_address" regex = "\\b[a-zA-Z0-9._%+\\-]+@[a-zA-Z0-9.\\-]+\\.[a-zA-Z]{2,}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/contact/phone.toml b/crates/nvisy-pattern/assets/patterns/contact/phone.toml index c4caa2e8..b2e1faf5 100644 --- a/crates/nvisy-pattern/assets/patterns/contact/phone.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/phone.toml @@ -1,5 +1,5 @@ name = "phone" -entity_kind = "phone_number" +label = "phone_number" regex = "(?:\\+\\d{1,3}[\\s.\\-]?)?\\(?\\d{2,4}\\)?[\\s.\\-]?\\d{3,4}[\\s.\\-]?\\d{4}\\b" score = 0.8 validator = "phone" diff --git a/crates/nvisy-pattern/assets/patterns/contact/url.toml b/crates/nvisy-pattern/assets/patterns/contact/url.toml index fe7f538a..24c3c9a2 100644 --- 
a/crates/nvisy-pattern/assets/patterns/contact/url.toml +++ b/crates/nvisy-pattern/assets/patterns/contact/url.toml @@ -1,4 +1,4 @@ name = "url" -entity_kind = "url" +label = "url" regex = "\\bhttps?://[^\\s/$.?#][^\\s]*\\b" score = 0.9 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml index 8d36727c..2748a222 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/aws_key.toml @@ -1,4 +1,4 @@ name = "aws-key" -entity_kind = "api_key" +label = "api_key" regex = "\\bAKIA[0-9A-Z]{16}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml index 673e3fed..4c851fa1 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/generic_api_key.toml @@ -1,4 +1,4 @@ name = "generic-api-key" -entity_kind = "api_key" +label = "api_key" regex = "(?i)(?:api[_\\-]?key|api[_\\-]?secret|access[_\\-]?token|secret[_\\-]?key|bearer)\\s*[:=]\\s*[\"']?([a-zA-Z0-9_\\-]{20,})[\"']?" 
score = 0.7 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml b/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml index f25e9ea4..39c9bb1c 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/github_token.toml @@ -1,4 +1,4 @@ name = "github-token" -entity_kind = "auth_token" +label = "auth_token" regex = "\\bgh[pousr]_[a-zA-Z0-9]{36}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml index eb51a53e..cdaff752 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/private_key.toml @@ -1,4 +1,4 @@ name = "private-key" -entity_kind = "private_key" +label = "private_key" regex = "-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----" score = 0.98 diff --git a/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml b/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml index 7cd769a6..73437d3b 100644 --- a/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml +++ b/crates/nvisy-pattern/assets/patterns/credentials/stripe_key.toml @@ -1,4 +1,4 @@ name = "stripe-key" -entity_kind = "api_key" +label = "api_key" regex = "\\bsk_(live|test)_[a-zA-Z0-9]{24,}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml b/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml index 173375f5..23f78873 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/bitcoin_address.toml @@ -1,4 +1,4 @@ name = "bitcoin-address" -entity_kind = "crypto_address" +label = "crypto_address" regex = "\\b(?:bc1[a-z0-9]{25,39}|[13][a-km-zA-HJ-NP-Z1-9]{25,34})\\b" score = 0.85 diff --git 
a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml b/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml index f197ad68..9d73cd20 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/credit_card.toml @@ -1,5 +1,5 @@ name = "credit-card" -entity_kind = "payment_card" +label = "payment_card" regex = "\\b(?:\\d[ \\-]*?){13,19}\\b" score = 0.85 validator = "luhn" diff --git a/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml b/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml index 96d979cd..02fa0939 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/ethereum_address.toml @@ -1,4 +1,4 @@ name = "ethereum-address" -entity_kind = "crypto_address" +label = "crypto_address" regex = "\\b0x[0-9a-fA-F]{40}\\b" score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/finance/iban.toml b/crates/nvisy-pattern/assets/patterns/finance/iban.toml index dd16062a..364dff6c 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/iban.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/iban.toml @@ -1,5 +1,5 @@ name = "iban" -entity_kind = "iban" +label = "iban" regex = "\\b[A-Z]{2}\\d{2}\\s?[A-Z0-9]{4}\\s?(?:\\d{4}\\s?){2,7}\\d{1,4}\\b" score = 0.85 validator = "iban" diff --git a/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml b/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml index 628249e3..39b5c508 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/swift_code.toml @@ -1,4 +1,4 @@ name = "swift-code" -entity_kind = "swift_code" +label = "swift_code" regex = "\\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?\\b" score = 0.7 diff --git a/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml b/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml 
index 83523604..b6fadd82 100644 --- a/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml +++ b/crates/nvisy-pattern/assets/patterns/finance/us_bank_routing.toml @@ -1,4 +1,4 @@ name = "us-bank-routing" -entity_kind = "bank_routing" +label = "bank_routing" regex = "\\b(?:0[1-9]|[12]\\d|3[0-2])\\d{7}\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/identity/ssn.toml b/crates/nvisy-pattern/assets/patterns/identity/ssn.toml index 67f1cd8a..17028ed5 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/ssn.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/ssn.toml @@ -1,5 +1,5 @@ name = "ssn" -entity_kind = "government_id" +label = "government_id" regex = "\\b(\\d{3})-(\\d{2})-(\\d{4})\\b" score = 0.9 validator = "ssn" diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml b/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml index 614471f5..0720e2b9 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/us_drivers_license.toml @@ -1,4 +1,4 @@ name = "us-drivers-license" -entity_kind = "drivers_license" +label = "drivers_license" regex = "\\b[A-Z]\\d{3}-\\d{4}-\\d{4}\\b" score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml b/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml index 52aa5c6b..48da58bd 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml +++ b/crates/nvisy-pattern/assets/patterns/identity/us_passport.toml @@ -1,4 +1,4 @@ name = "us-passport" -entity_kind = "passport_number" +label = "passport_number" regex = "\\b[A-Z]\\d{8}\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml b/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml index f4aee1d8..adf40e1d 100644 --- a/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml +++ 
b/crates/nvisy-pattern/assets/patterns/identity/us_postal_code.toml @@ -1,4 +1,4 @@ name = "us-postal-code" -entity_kind = "postal_code" +label = "postal_code" regex = "\\b\\d{5}(?:-\\d{4})?\\b" score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/network/ipv4.toml b/crates/nvisy-pattern/assets/patterns/network/ipv4.toml index e8005ca5..914c6b46 100644 --- a/crates/nvisy-pattern/assets/patterns/network/ipv4.toml +++ b/crates/nvisy-pattern/assets/patterns/network/ipv4.toml @@ -1,4 +1,4 @@ name = "ipv4" -entity_kind = "ip_address" +label = "ip_address" regex = "\\b(?:(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\.){3}(?:25[0-5]|2[0-4]\\d|[01]?\\d\\d?)\\b" score = 0.75 diff --git a/crates/nvisy-pattern/assets/patterns/network/ipv6.toml b/crates/nvisy-pattern/assets/patterns/network/ipv6.toml index b474c6ee..0107ad00 100644 --- a/crates/nvisy-pattern/assets/patterns/network/ipv6.toml +++ b/crates/nvisy-pattern/assets/patterns/network/ipv6.toml @@ -1,4 +1,4 @@ name = "ipv6" -entity_kind = "ip_address" +label = "ip_address" regex = "\\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\\b|(?:[0-9a-fA-F]{1,4}:){1,7}:|::(?:[0-9a-fA-F]{1,4}:){0,5}[0-9a-fA-F]{1,4}\\b" score = 0.75 diff --git a/crates/nvisy-pattern/assets/patterns/network/mac_address.toml b/crates/nvisy-pattern/assets/patterns/network/mac_address.toml index fc25aa9a..2766fc31 100644 --- a/crates/nvisy-pattern/assets/patterns/network/mac_address.toml +++ b/crates/nvisy-pattern/assets/patterns/network/mac_address.toml @@ -1,4 +1,4 @@ name = "mac-address" -entity_kind = "mac_address" +label = "mac_address" regex = "\\b(?:[0-9A-Fa-f]{2}[:\\-]){5}[0-9A-Fa-f]{2}\\b" score = 0.85 diff --git a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml b/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml index 71811bde..8bafa63f 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml +++ b/crates/nvisy-pattern/assets/patterns/personal/date_of_birth.toml @@ -1,5 +1,5 @@ name = 
"date_of_birth" -entity_kind = "date_of_birth" +label = "date_of_birth" regex = "\\b(?:(?:0[1-9]|1[0-2]|[1-9])[/\\-](?:0[1-9]|[12]\\d|3[01]|[1-9])[/\\-](?:19|20)\\d{2}|(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01]))\\b" score = 0.6 validator = "date" diff --git a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml b/crates/nvisy-pattern/assets/patterns/personal/datetime.toml index c086548e..10ed0de4 100644 --- a/crates/nvisy-pattern/assets/patterns/personal/datetime.toml +++ b/crates/nvisy-pattern/assets/patterns/personal/datetime.toml @@ -1,5 +1,5 @@ name = "datetime" -entity_kind = "date_time" +label = "date_time" regex = "\\b(?:19|20)\\d{2}[/\\-](?:0[1-9]|1[0-2])[/\\-](?:0[1-9]|[12]\\d|3[01])[T ](?:[01]\\d|2[0-3]):[0-5]\\d(?::[0-5]\\d)?(?:Z|[+\\-]\\d{2}:?\\d{2})?\\b" score = 0.7 diff --git a/crates/nvisy-pattern/src/recognition/dictionary.rs b/crates/nvisy-pattern/src/recognition/dictionary.rs index d4d81ea6..856b707f 100644 --- a/crates/nvisy-pattern/src/recognition/dictionary.rs +++ b/crates/nvisy-pattern/src/recognition/dictionary.rs @@ -25,7 +25,7 @@ use derive_builder::Builder; use nvisy_core::Error; use nvisy_core::context::Context; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -44,8 +44,8 @@ use super::terms::Terms; pub struct Dictionary { /// Human-readable identifier (e.g. `"nationalities"`). pub name: String, - /// Entity kind every match emits. - pub entity_kind: EntityKind, + /// Entity label every match emits. + pub label: EntityLabelRef, /// Literal terms to scan for. The recognizer compiles these into /// an Aho-Corasick automaton at build time. pub terms: Terms, @@ -97,7 +97,7 @@ fn default_word_boundary() -> bool { impl Dictionary { /// Start a chainable builder. Required fields: `name`, - /// `entity_kind`, `terms`. + /// `label`, `terms`. 
#[must_use] pub fn builder() -> DictionaryBuilder { DictionaryBuilder::default() @@ -142,7 +142,7 @@ impl Dictionary { })?; let mut builder = Dictionary::builder() .with_name(metadata.name) - .with_entity_kind(metadata.entity_kind); + .with_label(metadata.label); if let Some(score) = metadata.score { builder = builder.with_score(score); } @@ -164,7 +164,7 @@ impl Dictionary { #[derive(Debug, Clone, Serialize, Deserialize)] struct DictionaryMetadata { name: String, - entity_kind: EntityKind, + label: EntityLabelRef, #[serde(default)] score: Option, #[serde(default)] diff --git a/crates/nvisy-pattern/src/recognition/recognizer.rs b/crates/nvisy-pattern/src/recognition/recognizer.rs index 93a58dd7..42fee424 100644 --- a/crates/nvisy-pattern/src/recognition/recognizer.rs +++ b/crates/nvisy-pattern/src/recognition/recognizer.rs @@ -11,7 +11,7 @@ use std::sync::Arc; use aho_corasick::{AhoCorasick, MatchKind}; -use nvisy_core::entity::{Entity, EntityKind, PatternProvenance, TrailProvenance, TrailStep}; +use nvisy_core::entity::{Entity, EntityLabelRef, PatternProvenance, TrailProvenance, TrailStep}; use nvisy_core::modality::{Text, TextLocation}; use nvisy_core::primitive::{Confidence, LanguageTag}; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput, RecognizerOutput}; @@ -31,7 +31,7 @@ use crate::validators::{Validator, ValidatorRegistry}; /// [`ContextEnhancer`]: crate::ContextEnhancer struct CompiledPattern { name: String, - entity_kind: EntityKind, + label: EntityLabelRef, regex: Regex, raw_regex: String, score: Confidence, @@ -45,7 +45,7 @@ struct CompiledPattern { /// emission metadata. struct CompiledDictionary { name: String, - entity_kind: EntityKind, + label: EntityLabelRef, /// First term-id (inclusive) for this dictionary inside the /// shared automaton. 
term_start: usize, @@ -145,7 +145,7 @@ impl PatternRecognizerBuilder { regex_sources.push(pattern.regex.clone()); compiled_patterns.push(CompiledPattern { name: pattern.name.clone(), - entity_kind: pattern.entity_kind, + label: pattern.label.clone(), regex, raw_regex: pattern.regex.clone(), score: pattern.score, @@ -181,7 +181,7 @@ impl PatternRecognizerBuilder { let term_end = all_terms.len(); compiled_dicts.push(CompiledDictionary { name: dict.name.clone(), - entity_kind: dict.entity_kind, + label: dict.label.clone(), term_start, term_end, term_scores, @@ -288,7 +288,7 @@ fn build_pattern_entity(pat: &CompiledPattern, start: usize, end: usize) -> Enti format!("pattern `{}` matched", pat.name), ); Entity::builder() - .with_entity_kind(pat.entity_kind) + .with_label(pat.label.clone()) .with_trail(vec![step]) .with_confidence(pat.score) .with_location(TextLocation::new(start, end)) @@ -331,7 +331,7 @@ fn build_dictionary_entity( format!("dictionary `{}` matched", dict.name), ); Entity::builder() - .with_entity_kind(dict.entity_kind) + .with_label(dict.label.clone()) .with_trail(vec![step]) .with_confidence(score) .with_location(TextLocation::new(start, end)) @@ -341,7 +341,7 @@ fn build_dictionary_entity( #[cfg(test)] mod tests { - use nvisy_core::entity::EntityKind; + use nvisy_core::entity::builtins; use nvisy_core::modality::TextData; use nvisy_core::recognition::RecognizerInput; @@ -353,7 +353,7 @@ mod tests { fn dict(name: &str, terms: &[&str], word_boundary: bool) -> Dictionary { Dictionary::builder() .with_name(name.to_owned()) - .with_entity_kind(EntityKind::Language) + .with_label(EntityLabelRef::from(builtins::LANGUAGE.name.clone())) .with_terms(Terms::from(terms)) .with_word_boundary(word_boundary) .build() diff --git a/crates/nvisy-pattern/src/recognition/regex_rule.rs b/crates/nvisy-pattern/src/recognition/regex_rule.rs index 55fb348b..5cfec944 100644 --- a/crates/nvisy-pattern/src/recognition/regex_rule.rs +++ 
b/crates/nvisy-pattern/src/recognition/regex_rule.rs @@ -12,7 +12,7 @@ use derive_builder::Builder; use nvisy_core::Error; use nvisy_core::context::Context; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::{Confidence, LanguageTag}; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -34,8 +34,8 @@ pub struct Regex { /// Surfaced in trail steps so downstream consumers can see /// which rule matched. pub name: String, - /// Entity kind every match emits. - pub entity_kind: EntityKind, + /// Entity label every match emits. + pub label: EntityLabelRef, /// Regex source. Compiled to a [`regex::Regex`] by /// [`PatternRecognizer::build`]; shape /// errors there, not here. @@ -70,7 +70,7 @@ pub struct Regex { impl Regex { /// Start a chainable builder. Required fields: `name`, - /// `entity_kind`, `regex`. + /// `label`, `regex`. #[must_use] pub fn builder() -> RegexBuilder { RegexBuilder::default() diff --git a/crates/nvisy-pattern/testdata/dictionaries/product_codes.toml b/crates/nvisy-pattern/testdata/dictionaries/product_codes.toml index 5cd87450..50c65d25 100644 --- a/crates/nvisy-pattern/testdata/dictionaries/product_codes.toml +++ b/crates/nvisy-pattern/testdata/dictionaries/product_codes.toml @@ -1,3 +1,3 @@ name = "product_codes" -entity_kind = "internal_id" +label = "internal_id" score = 0.8 diff --git a/crates/nvisy-pattern/testdata/patterns/employee_id.toml b/crates/nvisy-pattern/testdata/patterns/employee_id.toml index 8a2d1e04..19949064 100644 --- a/crates/nvisy-pattern/testdata/patterns/employee_id.toml +++ b/crates/nvisy-pattern/testdata/patterns/employee_id.toml @@ -1,4 +1,4 @@ name = "internal-employee-id" -entity_kind = "internal_id" +label = "internal_id" regex = "\\bEMP-\\d{5}\\b" score = 0.95 diff --git a/crates/nvisy-pattern/testdata/patterns/product_codes.toml b/crates/nvisy-pattern/testdata/patterns/product_codes.toml index 4654a1fb..29f74865 100644 --- 
a/crates/nvisy-pattern/testdata/patterns/product_codes.toml +++ b/crates/nvisy-pattern/testdata/patterns/product_codes.toml @@ -1,4 +1,4 @@ name = "internal-product-code" -entity_kind = "internal_id" +label = "internal_id" regex = "\\b(?:WIDGET-\\d{3}|SPROCKET-\\d{2}|GADGET-X\\d)\\b" score = 0.9 diff --git a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs index ff36a639..6bb637f6 100644 --- a/crates/nvisy-pattern/tests/enhancer_roundtrip.rs +++ b/crates/nvisy-pattern/tests/enhancer_roundtrip.rs @@ -7,7 +7,7 @@ //! [`Refinement`]: nvisy_core::entity::TrailStepKind::Refinement use nvisy_core::context::{Context, ContextEnhancer}; -use nvisy_core::entity::{EntityKind, PatternProvenance, TrailProvenance, TrailStepKind}; +use nvisy_core::entity::{PatternProvenance, TrailProvenance, TrailStepKind, builtins}; use nvisy_core::modality::TextData; use nvisy_core::primitive::Confidence; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; @@ -17,7 +17,7 @@ use nvisy_pattern::{PatternRecognizer, PatternRegistry, Regex}; async fn enhancer_boosts_matches_near_keyword_only() { let ssn = Regex::builder() .with_name("ssn") - .with_entity_kind(EntityKind::GovernmentId) + .with_label(builtins::GOVERNMENT_ID.label_ref()) .with_regex(r"\b\d{3}-\d{2}-\d{4}\b") .with_score(Confidence::clamped(0.6)) .with_context(Context::new(["ssn", "social security"])) diff --git a/crates/nvisy-pattern/tests/shipped_detection.rs b/crates/nvisy-pattern/tests/shipped_detection.rs index f0849542..c6374987 100644 --- a/crates/nvisy-pattern/tests/shipped_detection.rs +++ b/crates/nvisy-pattern/tests/shipped_detection.rs @@ -7,7 +7,7 @@ //! so the fixtures and shipped regexes can both evolve without //! brittle byte-position churn. 
-use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; use nvisy_core::modality::{Text, TextData}; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; use nvisy_pattern::{PatternRecognizer, PatternRegistry}; @@ -30,16 +30,16 @@ async fn scan(text: &str) -> (String, Vec>) { (text.to_owned(), entities) } -fn assert_match(text: &str, entities: &[Entity], kind: EntityKind, needle: &str) { +fn assert_match(text: &str, entities: &[Entity], label: EntityLabelRef, needle: &str) { let hit = entities .iter() - .any(|e| e.entity_kind == kind && &text[e.location.start..e.location.end] == needle); + .any(|e| e.label == label && &text[e.location.start..e.location.end] == needle); assert!( hit, - "expected `{needle}` as {kind:?}; got: {:?}", + "expected `{needle}` as {label:?}; got: {:?}", entities .iter() - .map(|e| (e.entity_kind, &text[e.location.start..e.location.end])) + .map(|e| (e.label.clone(), &text[e.location.start..e.location.end])) .collect::>() ); } @@ -50,19 +50,19 @@ async fn contact_inputs_yield_expected_entities() { assert_match( &text, &entities, - EntityKind::EmailAddress, + builtins::EMAIL_ADDRESS.label_ref(), "alice.johnson@example.com", ); assert_match( &text, &entities, - EntityKind::Url, + builtins::URL.label_ref(), "https://docs.example.com/proposal", ); assert_match( &text, &entities, - EntityKind::Url, + builtins::URL.label_ref(), "http://backup.example.org/proposal-v2", ); } @@ -70,8 +70,18 @@ async fn contact_inputs_yield_expected_entities() { #[tokio::test] async fn identity_inputs_yield_expected_entities() { let (text, entities) = scan(include_str!("../testdata/inputs/identity.txt")).await; - assert_match(&text, &entities, EntityKind::GovernmentId, "123-45-6789"); - assert_match(&text, &entities, EntityKind::DateOfBirth, "1985-03-14"); + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "123-45-6789", + ); + assert_match( + &text, + &entities, + 
builtins::DATE_OF_BIRTH.label_ref(), + "1985-03-14", + ); } #[tokio::test] @@ -80,19 +90,19 @@ async fn finance_inputs_yield_expected_entities() { assert_match( &text, &entities, - EntityKind::PaymentCard, + builtins::PAYMENT_CARD.label_ref(), "4539 1488 0343 6467", ); assert_match( &text, &entities, - EntityKind::CryptoAddress, + builtins::CRYPTO_ADDRESS.label_ref(), "1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa", ); assert_match( &text, &entities, - EntityKind::CryptoAddress, + builtins::CRYPTO_ADDRESS.label_ref(), "0x742d35Cc6634C0532925a3b844Bc9e7595f6E842", ); // Currency and cryptocurrency dictionaries emit `Currency`; @@ -100,7 +110,7 @@ async fn finance_inputs_yield_expected_entities() { assert!( entities .iter() - .any(|e| matches!(e.entity_kind, EntityKind::Currency)), + .any(|e| e.label == builtins::CURRENCY.label_ref()), "expected at least one currency/crypto dictionary hit" ); } @@ -108,12 +118,17 @@ async fn finance_inputs_yield_expected_entities() { #[tokio::test] async fn credentials_inputs_yield_expected_entities() { let (text, entities) = scan(include_str!("../testdata/inputs/credentials.txt")).await; - assert_match(&text, &entities, EntityKind::ApiKey, "AKIAIOSFODNN7EXAMPLE"); + assert_match( + &text, + &entities, + builtins::API_KEY.label_ref(), + "AKIAIOSFODNN7EXAMPLE", + ); // Private-key pattern matches the BEGIN header. 
assert!( entities .iter() - .any(|e| e.entity_kind == EntityKind::PrivateKey), + .any(|e| e.label == builtins::PRIVATE_KEY.label_ref()), "expected at least one PrivateKey entity" ); } @@ -121,19 +136,34 @@ async fn credentials_inputs_yield_expected_entities() { #[tokio::test] async fn network_inputs_yield_expected_entities() { let (text, entities) = scan(include_str!("../testdata/inputs/network.txt")).await; - assert_match(&text, &entities, EntityKind::IpAddress, "192.168.1.42"); - assert_match(&text, &entities, EntityKind::IpAddress, "10.0.0.7"); - assert_match(&text, &entities, EntityKind::IpAddress, "203.0.113.55"); assert_match( &text, &entities, - EntityKind::IpAddress, + builtins::IP_ADDRESS.label_ref(), + "192.168.1.42", + ); + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), + "10.0.0.7", + ); + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), + "203.0.113.55", + ); + assert_match( + &text, + &entities, + builtins::IP_ADDRESS.label_ref(), "2001:0db8:85a3:0000:0000:8a2e:0370:7334", ); assert_match( &text, &entities, - EntityKind::MacAddress, + builtins::MAC_ADDRESS.label_ref(), "00:1A:2B:3C:4D:5E", ); } @@ -141,11 +171,16 @@ async fn network_inputs_yield_expected_entities() { #[tokio::test] async fn personal_inputs_yield_expected_entities() { let (text, entities) = scan(include_str!("../testdata/inputs/personal.txt")).await; - assert_match(&text, &entities, EntityKind::DateOfBirth, "04/22/1979"); assert_match( &text, &entities, - EntityKind::DateTime, + builtins::DATE_OF_BIRTH.label_ref(), + "04/22/1979", + ); + assert_match( + &text, + &entities, + builtins::DATE_TIME.label_ref(), "2024-06-15T09:30:00Z", ); // Nationality and language dictionaries pick up `Italian`, @@ -153,13 +188,13 @@ async fn personal_inputs_yield_expected_entities() { assert!( entities .iter() - .any(|e| e.entity_kind == EntityKind::Nationality), + .any(|e| e.label == builtins::NATIONALITY.label_ref()), "expected at least one Nationality" 
); assert!( entities .iter() - .any(|e| e.entity_kind == EntityKind::Language), + .any(|e| e.label == builtins::LANGUAGE.label_ref()), "expected at least one Language" ); } diff --git a/crates/nvisy-pattern/tests/user_rules.rs b/crates/nvisy-pattern/tests/user_rules.rs index cd80d44b..6e78c8ac 100644 --- a/crates/nvisy-pattern/tests/user_rules.rs +++ b/crates/nvisy-pattern/tests/user_rules.rs @@ -5,7 +5,7 @@ //! confirm a real internal-handoff document yields the custom //! entities. -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use nvisy_core::modality::TextData; use nvisy_core::recognition::{EntityRecognizer, RecognizerInput}; use nvisy_pattern::{Dictionary, PatternRecognizer, PatternRegistry, Regex, Terms}; @@ -53,7 +53,7 @@ async fn user_toml_rules_load_and_detect() { // The custom regex finds both employee numbers. let emp_hits: Vec<&str> = entities .iter() - .filter(|e| e.entity_kind == EntityKind::InternalId) + .filter(|e| e.label == builtins::INTERNAL_ID.label_ref()) .map(|e| &text[e.location.start..e.location.end]) .collect(); assert!( @@ -86,7 +86,7 @@ async fn user_toml_rules_load_and_detect() { assert!( entities .iter() - .any(|e| e.entity_kind == EntityKind::EmailAddress + .any(|e| e.label == builtins::EMAIL_ADDRESS.label_ref() && &text[e.location.start..e.location.end] == "counsel@example.com"), "expected shipped email pattern to fire alongside user rules" ); diff --git a/crates/nvisy-toolkit/examples/pipeline.rs b/crates/nvisy-toolkit/examples/pipeline.rs index 2d8a593f..9a6a0656 100644 --- a/crates/nvisy-toolkit/examples/pipeline.rs +++ b/crates/nvisy-toolkit/examples/pipeline.rs @@ -21,7 +21,7 @@ use std::str::from_utf8; use nvisy_codec::{CodecRegistry, DocumentHandle}; use nvisy_core::Result; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use nvisy_core::modality::{Text, TextData}; use nvisy_core::primitive::ConfidenceThreshold; use nvisy_core::recognition::RecognizerInput; @@ -70,7 +70,7 @@ async 
fn main() -> Result<()> { let matched = &SAMPLE[entity.location.start..entity.location.end]; println!( " - {:?} {:?} at {}..{} (confidence {:.2})", - entity.entity_kind, + entity.label, matched, entity.location.start, entity.location.end, @@ -109,9 +109,15 @@ async fn main() -> Result<()> { // returns a `Redactions` batch; `redact_at` flushes the batch // back into the codec handler in place. let redaction = RedactionRegistry::::new() - .insert_kind(EntityKind::EmailAddress, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::PhoneNumber, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::PaymentCard, Mask::stars()) + .insert_label( + builtins::EMAIL_ADDRESS.label_ref(), + Replace::new("[{label}]"), + ) + .insert_label( + builtins::PHONE_NUMBER.label_ref(), + Replace::new("[{label}]"), + ) + .insert_label(builtins::PAYMENT_CARD.label_ref(), Mask::stars()) .with_fallback(Redact); // `source` is the codec-backed handle; it satisfies `DataAt`, diff --git a/crates/nvisy-toolkit/src/deduplication/filter/mod.rs b/crates/nvisy-toolkit/src/deduplication/filter/mod.rs index 421264e1..044ab5a1 100644 --- a/crates/nvisy-toolkit/src/deduplication/filter/mod.rs +++ b/crates/nvisy-toolkit/src/deduplication/filter/mod.rs @@ -1,4 +1,4 @@ -//! [`FilterLayer`]: drop entities outside the allowed kinds or +//! [`FilterLayer`]: drop entities outside the allowed labels or //! below the confidence floor. //! //! Dropped entities are returned from [`Layer::apply`] so the @@ -6,26 +6,26 @@ //! //! [`Layer::apply`]: super::layer::Layer::apply -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef}; use nvisy_core::extraction::TextAt; use nvisy_core::modality::Modality; use nvisy_core::primitive::ConfidenceThreshold; use super::layer::{Layer, LayerContext}; -/// [`Layer`] that drops entities outside the allowed kinds or +/// [`Layer`] that drops entities outside the allowed labels or /// below the confidence floor. 
Returns the dropped entities from /// [`Layer::apply`]. /// /// Construct empty with [`FilterLayer::new`] (default = pass -/// everything) and configure via [`with_allowed_kinds`] / +/// everything) and configure via [`with_allowed_labels`] / /// [`with_confidence_threshold`]. /// -/// [`with_allowed_kinds`]: Self::with_allowed_kinds +/// [`with_allowed_labels`]: Self::with_allowed_labels /// [`with_confidence_threshold`]: Self::with_confidence_threshold #[derive(Debug, Clone, Default)] pub struct FilterLayer { - allowed_kinds: Option>, + allowed_labels: Option>, confidence_threshold: Option, } @@ -35,11 +35,11 @@ impl FilterLayer { Self::default() } - /// Drop entities whose `entity_kind` is outside this set. - /// `None` keeps every kind (same as not calling this). + /// Drop entities whose `label` is outside this set. + /// `None` keeps every label (same as not calling this). #[must_use] - pub fn with_allowed_kinds(mut self, kinds: Option>) -> Self { - self.allowed_kinds = kinds; + pub fn with_allowed_labels(mut self, labels: Option>) -> Self { + self.allowed_labels = labels; self } @@ -54,8 +54,8 @@ impl FilterLayer { /// Whether `entity` clears every configured filter knob. 
pub fn passes(&self, entity: &Entity) -> bool { - if let Some(ref kinds) = self.allowed_kinds - && !kinds.contains(&entity.entity_kind) + if let Some(ref labels) = self.allowed_labels + && !labels.contains(&entity.label) { return false; } @@ -89,16 +89,16 @@ impl + ?Sized> Layer for FilterLayer { #[cfg(test)] mod tests { - use nvisy_core::entity::{Entity, EntityKind}; + use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; use nvisy_core::modality::Text; use nvisy_core::primitive::Confidence; use super::*; use crate::deduplication::test_resolver; - fn ent(kind: EntityKind, conf: f64) -> Entity { + fn ent(label: EntityLabelRef, conf: f64) -> Entity { Entity::test_builder(0, 4) - .with_entity_kind(kind) + .with_label(label) .with_confidence(Confidence::new(conf).expect("in range")) .test_build() } @@ -115,8 +115,8 @@ mod tests { #[tokio::test] async fn default_layer_keeps_everything() { let mut entities: Vec> = vec![ - ent(EntityKind::PersonName, 0.9), - ent(EntityKind::EmailAddress, 0.4), + ent(builtins::PERSON_NAME.label_ref(), 0.9), + ent(builtins::EMAIL_ADDRESS.label_ref(), 0.4), ]; let dropped = apply(FilterLayer::new(), &mut entities).await; assert_eq!(entities.len(), 2); @@ -124,24 +124,25 @@ mod tests { } #[tokio::test] - async fn allowed_kinds_drops_outsiders() { + async fn allowed_labels_drops_outsiders() { let mut entities: Vec> = vec![ - ent(EntityKind::PersonName, 0.9), - ent(EntityKind::EmailAddress, 0.9), + ent(builtins::PERSON_NAME.label_ref(), 0.9), + ent(builtins::EMAIL_ADDRESS.label_ref(), 0.9), ]; - let layer = FilterLayer::new().with_allowed_kinds(Some(vec![EntityKind::PersonName])); + let layer = + FilterLayer::new().with_allowed_labels(Some(vec![builtins::PERSON_NAME.label_ref()])); let dropped = apply(layer, &mut entities).await; assert_eq!(entities.len(), 1); - assert_eq!(entities[0].entity_kind, EntityKind::PersonName); + assert_eq!(entities[0].label, builtins::PERSON_NAME.label_ref()); assert_eq!(dropped.len(), 1); - 
assert_eq!(dropped[0].entity_kind, EntityKind::EmailAddress); + assert_eq!(dropped[0].label, builtins::EMAIL_ADDRESS.label_ref()); } #[tokio::test] async fn confidence_threshold_drops_below() { let mut entities: Vec> = vec![ - ent(EntityKind::PersonName, 0.95), - ent(EntityKind::PersonName, 0.40), + ent(builtins::PERSON_NAME.label_ref(), 0.95), + ent(builtins::PERSON_NAME.label_ref(), 0.40), ]; let layer = FilterLayer::new().with_confidence_threshold(Some(ConfidenceThreshold::clamped(0.5))); @@ -152,14 +153,14 @@ mod tests { } #[tokio::test] - async fn kinds_and_threshold_compose() { + async fn labels_and_threshold_compose() { let mut entities: Vec> = vec![ - ent(EntityKind::PersonName, 0.95), // keep - ent(EntityKind::PersonName, 0.40), // drop: threshold - ent(EntityKind::EmailAddress, 0.95), // drop: kind + ent(builtins::PERSON_NAME.label_ref(), 0.95), // keep + ent(builtins::PERSON_NAME.label_ref(), 0.40), // drop: threshold + ent(builtins::EMAIL_ADDRESS.label_ref(), 0.95), // drop: label ]; let layer = FilterLayer::new() - .with_allowed_kinds(Some(vec![EntityKind::PersonName])) + .with_allowed_labels(Some(vec![builtins::PERSON_NAME.label_ref()])) .with_confidence_threshold(Some(ConfidenceThreshold::clamped(0.5))); let dropped = apply(layer, &mut entities).await; assert_eq!(entities.len(), 1); diff --git a/crates/nvisy-toolkit/src/deduplication/fuse/group.rs b/crates/nvisy-toolkit/src/deduplication/fuse/group.rs index 6ab9397b..8d8e7f42 100644 --- a/crates/nvisy-toolkit/src/deduplication/fuse/group.rs +++ b/crates/nvisy-toolkit/src/deduplication/fuse/group.rs @@ -13,7 +13,7 @@ use std::collections::{HashMap, HashSet}; use std::mem; -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef}; use nvisy_core::extraction::TextAt; use nvisy_core::modality::{Modality, Overlap}; use schemars::JsonSchema; @@ -118,10 +118,10 @@ where // Phase 2a: within each bucket, sub-group by location overlap. 
let mut groups: Vec>> = Vec::new(); - let mut kind_groups: HashMap> = HashMap::new(); + let mut label_groups: HashMap> = HashMap::new(); for (_key, bucket) in buckets { - let kind = bucket[0].entity_kind; + let label = bucket[0].label.clone(); let mut sub_groups: Vec>> = Vec::new(); for entity in bucket { @@ -140,7 +140,7 @@ where let idx = groups.len(); groups.push(sg); if is_substring { - kind_groups.entry(kind).or_default().push(idx); + label_groups.entry(label.clone()).or_default().push(idx); } } } @@ -149,7 +149,7 @@ where // kind whose values have a containment relationship. if is_substring { let mut total_merges = 0usize; - for indices in kind_groups.values() { + for indices in label_groups.values() { let mut merged_into: HashSet = HashSet::new(); for i in 0..indices.len() { if merged_into.contains(&indices[i]) { diff --git a/crates/nvisy-toolkit/src/deduplication/fuse/key.rs b/crates/nvisy-toolkit/src/deduplication/fuse/key.rs index dabe6549..7b45803b 100644 --- a/crates/nvisy-toolkit/src/deduplication/fuse/key.rs +++ b/crates/nvisy-toolkit/src/deduplication/fuse/key.rs @@ -1,6 +1,6 @@ //! [`GroupKey`]: hash key for the first grouping phase. 
-use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef}; use nvisy_core::extraction::TextAt; use nvisy_core::modality::Modality; @@ -18,7 +18,7 @@ use super::group::GroupingCriteria; /// [`Normalized`]: GroupingCriteria::Normalized #[derive(Hash, PartialEq, Eq)] pub(super) struct GroupKey { - pub(super) kind: EntityKind, + pub(super) label: EntityLabelRef, pub(super) value: String, } @@ -36,7 +36,7 @@ impl GroupKey { None => entity.id.to_string(), }; Self { - kind: entity.entity_kind, + label: entity.label.clone(), value, } } diff --git a/crates/nvisy-toolkit/src/deduplication/params.rs b/crates/nvisy-toolkit/src/deduplication/params.rs index b3f6244d..cc8ca7f8 100644 --- a/crates/nvisy-toolkit/src/deduplication/params.rs +++ b/crates/nvisy-toolkit/src/deduplication/params.rs @@ -15,7 +15,7 @@ //! [`ConflictResolution`]: super::resolve::ConflictResolution //! [`LayerPipeline::from_params`]: super::pipeline::LayerPipeline::from_params -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::EntityLabelRef; use nvisy_core::primitive::ConfidenceThreshold; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; @@ -40,10 +40,10 @@ pub struct LayerParams { /// Per-recognizer confidence scaling applied first. #[serde(default, skip_serializing_if = "CalibrationMap::is_empty")] pub calibration: CalibrationMap, - /// Drop entities whose `entity_kind` is outside this set. `None` - /// keeps every kind. + /// Drop entities whose `label` is outside this set. `None` + /// keeps every label. #[serde(default, skip_serializing_if = "Option::is_none")] - pub allowed_kinds: Option>, + pub allowed_labels: Option>, /// Minimum calibrated confidence an entity must clear to survive /// the filter step. `None` keeps every confidence level. 
#[serde(default, skip_serializing_if = "Option::is_none")] diff --git a/crates/nvisy-toolkit/src/deduplication/pipeline.rs b/crates/nvisy-toolkit/src/deduplication/pipeline.rs index 734b4cd1..b431f9d3 100644 --- a/crates/nvisy-toolkit/src/deduplication/pipeline.rs +++ b/crates/nvisy-toolkit/src/deduplication/pipeline.rs @@ -96,7 +96,7 @@ where /// fuse → resolve. Every layer's config is read from `params`. pub fn from_params(params: &LayerParams) -> Self { let filter = FilterLayer::new() - .with_allowed_kinds(params.allowed_kinds.clone()) + .with_allowed_labels(params.allowed_labels.clone()) .with_confidence_threshold(params.confidence_threshold); Self::new() .with_layer(CalibrateLayer::new(params.calibration.clone())) diff --git a/crates/nvisy-toolkit/src/deduplication/resolve/mod.rs b/crates/nvisy-toolkit/src/deduplication/resolve/mod.rs index f41f6b6d..d4ed14ab 100644 --- a/crates/nvisy-toolkit/src/deduplication/resolve/mod.rs +++ b/crates/nvisy-toolkit/src/deduplication/resolve/mod.rs @@ -61,7 +61,7 @@ where if losers[j] { continue; } - if entities[i].entity_kind == entities[j].entity_kind { + if entities[i].label == entities[j].label { continue; } if !entities[i].location.overlaps(&entities[j].location) { @@ -102,7 +102,7 @@ where #[cfg(test)] mod tests { - use nvisy_core::entity::{Entity, EntityKind}; + use nvisy_core::entity::{Entity, builtins}; use nvisy_core::modality::Text; use nvisy_core::primitive::Confidence; @@ -128,26 +128,26 @@ mod tests { async fn highest_confidence_keeps_winner() { let mut entities: Vec> = vec![ Entity::test_builder(0, 8) - .with_entity_kind(EntityKind::PhoneNumber) + .with_label(builtins::PHONE_NUMBER.label_ref()) .test_build(), Entity::test_builder(0, 8) - .with_entity_kind(EntityKind::EmailAddress) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_confidence(conf(0.8)) .test_build(), ]; let _ = apply(ConflictResolution::HighestConfidence, &mut entities).await; assert_eq!(entities.len(), 1); - 
assert_eq!(entities[0].entity_kind, EntityKind::PhoneNumber); + assert_eq!(entities[0].label, builtins::PHONE_NUMBER.label_ref()); } #[tokio::test] async fn non_overlapping_not_resolved() { let mut entities: Vec> = vec![ Entity::test_builder(0, 8) - .with_entity_kind(EntityKind::PhoneNumber) + .with_label(builtins::PHONE_NUMBER.label_ref()) .test_build(), Entity::test_builder(20, 24) - .with_entity_kind(EntityKind::EmailAddress) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_confidence(conf(0.8)) .test_build(), ]; @@ -159,10 +159,10 @@ mod tests { async fn same_kind_not_resolved() { let mut entities: Vec> = vec![ Entity::test_builder(0, 8) - .with_entity_kind(EntityKind::PhoneNumber) + .with_label(builtins::PHONE_NUMBER.label_ref()) .test_build(), Entity::test_builder(0, 8) - .with_entity_kind(EntityKind::PhoneNumber) + .with_label(builtins::PHONE_NUMBER.label_ref()) .with_confidence(conf(0.8)) .test_build(), ]; @@ -174,15 +174,15 @@ mod tests { async fn longest_span_keeps_longer() { let mut entities: Vec> = vec![ Entity::test_builder(0, 3) - .with_entity_kind(EntityKind::PhoneNumber) + .with_label(builtins::PHONE_NUMBER.label_ref()) .test_build(), Entity::test_builder(0, 8) - .with_entity_kind(EntityKind::EmailAddress) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_confidence(conf(0.7)) .test_build(), ]; let _ = apply(ConflictResolution::LongestSpan, &mut entities).await; assert_eq!(entities.len(), 1); - assert_eq!(entities[0].entity_kind, EntityKind::EmailAddress); + assert_eq!(entities[0].label, builtins::EMAIL_ADDRESS.label_ref()); } } diff --git a/crates/nvisy-toolkit/src/detection/chunks.rs b/crates/nvisy-toolkit/src/detection/chunks.rs index e6fe862e..6ae8f07c 100644 --- a/crates/nvisy-toolkit/src/detection/chunks.rs +++ b/crates/nvisy-toolkit/src/detection/chunks.rs @@ -121,7 +121,7 @@ impl LiftedFromText for Tabular { fn from_text(text_entity: Entity, location: TabularLocation) -> Entity { let mut builder = Entity::::builder() 
.with_id(text_entity.id) - .with_entity_kind(text_entity.entity_kind) + .with_label(text_entity.label) .with_location(location) .with_confidence(text_entity.confidence) .with_trail(text_entity.trail); diff --git a/crates/nvisy-toolkit/src/redaction/anonymizer/encrypt.rs b/crates/nvisy-toolkit/src/redaction/anonymizer/encrypt.rs index 6e3e8f33..5303ee7b 100644 --- a/crates/nvisy-toolkit/src/redaction/anonymizer/encrypt.rs +++ b/crates/nvisy-toolkit/src/redaction/anonymizer/encrypt.rs @@ -100,7 +100,7 @@ impl Anonymizer for Encrypt { #[cfg(test)] mod tests { - use nvisy_core::entity::{EntityKind, TrailStep}; + use nvisy_core::entity::{TrailStep, builtins}; use nvisy_core::modality::TextLocation; use nvisy_core::primitive::Confidence; @@ -114,7 +114,7 @@ mod tests { fn entity(start: usize, end: usize) -> Entity { Entity::builder() - .with_entity_kind(EntityKind::EmailAddress) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_location(TextLocation::new(start, end)) .with_confidence(Confidence::new(1.0).unwrap()) .with_trail(Vec::::new()) diff --git a/crates/nvisy-toolkit/src/redaction/anonymizer/hash.rs b/crates/nvisy-toolkit/src/redaction/anonymizer/hash.rs index e6b9cf69..c4f56925 100644 --- a/crates/nvisy-toolkit/src/redaction/anonymizer/hash.rs +++ b/crates/nvisy-toolkit/src/redaction/anonymizer/hash.rs @@ -102,7 +102,7 @@ fn hex(bytes: &[u8]) -> String { #[cfg(test)] mod tests { - use nvisy_core::entity::{EntityKind, TrailStep}; + use nvisy_core::entity::{TrailStep, builtins}; use nvisy_core::modality::TextLocation; use nvisy_core::primitive::Confidence; @@ -110,7 +110,7 @@ mod tests { fn entity(start: usize, end: usize) -> Entity { Entity::builder() - .with_entity_kind(EntityKind::EmailAddress) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_location(TextLocation::new(start, end)) .with_confidence(Confidence::new(1.0).unwrap()) .with_trail(Vec::::new()) diff --git a/crates/nvisy-toolkit/src/redaction/anonymizer/keep.rs 
b/crates/nvisy-toolkit/src/redaction/anonymizer/keep.rs index a376f673..5b74002b 100644 --- a/crates/nvisy-toolkit/src/redaction/anonymizer/keep.rs +++ b/crates/nvisy-toolkit/src/redaction/anonymizer/keep.rs @@ -1,7 +1,7 @@ //! [`Keep`]: pass the matched span through unchanged. //! //! Useful in mixed policies — e.g. mask every kind by default but -//! keep `EntityKind::Currency` so prices remain readable. The +//! keep `builtins::CURRENCY.label_ref()` so prices remain readable. The //! replacement records the original value verbatim so the audit //! trail still has a row. diff --git a/crates/nvisy-toolkit/src/redaction/anonymizer/mask.rs b/crates/nvisy-toolkit/src/redaction/anonymizer/mask.rs index 09238d6f..3609933a 100644 --- a/crates/nvisy-toolkit/src/redaction/anonymizer/mask.rs +++ b/crates/nvisy-toolkit/src/redaction/anonymizer/mask.rs @@ -133,7 +133,7 @@ impl Anonymizer for Mask { #[cfg(test)] mod tests { - use nvisy_core::entity::{EntityKind, TrailStep}; + use nvisy_core::entity::{TrailStep, builtins}; use nvisy_core::modality::TextLocation; use nvisy_core::primitive::Confidence; @@ -141,7 +141,7 @@ mod tests { fn entity(start: usize, end: usize) -> Entity { Entity::builder() - .with_entity_kind(EntityKind::PaymentCard) + .with_label(builtins::PAYMENT_CARD.label_ref()) .with_location(TextLocation::new(start, end)) .with_confidence(Confidence::new(1.0).unwrap()) .with_trail(Vec::::new()) diff --git a/crates/nvisy-toolkit/src/redaction/anonymizer/mod.rs b/crates/nvisy-toolkit/src/redaction/anonymizer/mod.rs index 30d2d0a7..8fe59172 100644 --- a/crates/nvisy-toolkit/src/redaction/anonymizer/mod.rs +++ b/crates/nvisy-toolkit/src/redaction/anonymizer/mod.rs @@ -3,10 +3,10 @@ //! //! Each operator is a typed Rust struct; consumers construct it //! with the parameters they want and register the instance -//! against the [`EntityKind`]s it should run for. +//! against the [`EntityLabelRef`]s it should run for. //! //! 
[`Anonymizer`]: crate::redaction::Anonymizer -//! [`EntityKind`]: nvisy_core::entity::EntityKind +//! [`EntityLabelRef`]: nvisy_core::entity::EntityLabelRef #[cfg(feature = "encrypt")] #[cfg_attr(docsrs, doc(cfg(feature = "encrypt")))] diff --git a/crates/nvisy-toolkit/src/redaction/anonymizer/replace.rs b/crates/nvisy-toolkit/src/redaction/anonymizer/replace.rs index 2139624d..e968fd9c 100644 --- a/crates/nvisy-toolkit/src/redaction/anonymizer/replace.rs +++ b/crates/nvisy-toolkit/src/redaction/anonymizer/replace.rs @@ -4,11 +4,10 @@ //! Templates support two placeholders that are expanded at apply //! time: //! -//! - `{entity_kind}` — the entity's [`EntityKind`] in snake_case -//! (e.g. `person_name`). +//! - `{label}` — the entity's label name (e.g. `person_name`). //! - `{value}` — the original matched substring. //! -//! The default template is `[{entity_kind}]`. +//! The default template is `[{label}]`. use nvisy_core::Result; use nvisy_core::entity::Entity; @@ -33,10 +32,11 @@ impl Replace { } impl Default for Replace { - /// Default template is `[{entity_kind}]` so users who don't - /// configure a template still get a visible kind-tagged marker. + /// Default template is `[{label}]` so users who don't + /// configure a template still get a visible label-tagged + /// marker. 
fn default() -> Self { - Self::new("[{entity_kind}]") + Self::new("[{label}]") } } @@ -51,8 +51,7 @@ impl Anonymizer for Replace { async fn apply(&self, entity: &Entity, source: &TextData) -> Result { let value = source.text.as_str(); - let kind = entity.entity_kind.to_string(); - let rendered = render(&self.template, &kind, value); + let rendered = render(&self.template, entity.label.as_str(), value); Ok(TextReplacement::substituted(rendered)) } } @@ -69,29 +68,26 @@ impl Anonymizer for Replace { source: &TextData, ) -> Result { let value = source.text.as_str(); - let kind = entity.entity_kind.to_string(); - let rendered = render(&self.template, &kind, value); + let rendered = render(&self.template, entity.label.as_str(), value); Ok(TabularReplacement::substituted(rendered)) } } -fn render(template: &str, kind: &str, value: &str) -> String { - template - .replace("{entity_kind}", kind) - .replace("{value}", value) +fn render(template: &str, label: &str, value: &str) -> String { + template.replace("{label}", label).replace("{value}", value) } #[cfg(test)] mod tests { - use nvisy_core::entity::{EntityKind, TrailStep}; + use nvisy_core::entity::{EntityLabelRef, TrailStep, builtins}; use nvisy_core::modality::TextLocation; use nvisy_core::primitive::Confidence; use super::*; - fn entity(kind: EntityKind, start: usize, end: usize) -> Entity { + fn entity(label: EntityLabelRef, start: usize, end: usize) -> Entity { Entity::builder() - .with_entity_kind(kind) + .with_label(label) .with_location(TextLocation::new(start, end)) .with_confidence(Confidence::new(1.0).unwrap()) .with_trail(Vec::::new()) @@ -100,19 +96,19 @@ mod tests { } #[tokio::test] - async fn default_template_emits_bracketed_kind() { + async fn default_template_emits_bracketed_label() { let op = Replace::default(); let source = TextData::new("alice@example.test"); - let entity = entity(EntityKind::EmailAddress, 0, 18); + let entity = entity(builtins::EMAIL_ADDRESS.label_ref(), 0, 18); let out = 
op.apply(&entity, &source).await.unwrap(); assert_eq!(out, TextReplacement::substituted("[email_address]")); } #[tokio::test] async fn template_with_value_placeholder() { - let op = Replace::new("<<{value}::{entity_kind}>>"); + let op = Replace::new("<<{value}::{label}>>"); let source = TextData::new("alice"); - let entity = entity(EntityKind::EmailAddress, 0, source.text.len()); + let entity = entity(builtins::EMAIL_ADDRESS.label_ref(), 0, source.text.len()); let out = op.apply(&entity, &source).await.unwrap(); assert_eq!( out, @@ -124,7 +120,7 @@ mod tests { async fn empty_source_yields_empty_value_placeholder() { let op = Replace::new("[{value}]"); let source = TextData::new(""); - let entity = entity(EntityKind::PersonName, 0, 0); + let entity = entity(builtins::PERSON_NAME.label_ref(), 0, 0); let out = op.apply(&entity, &source).await.unwrap(); assert_eq!(out, TextReplacement::substituted("[]")); } diff --git a/crates/nvisy-toolkit/src/redaction/deanonymizer/decrypt.rs b/crates/nvisy-toolkit/src/redaction/deanonymizer/decrypt.rs index a9a3dc58..da6a3eae 100644 --- a/crates/nvisy-toolkit/src/redaction/deanonymizer/decrypt.rs +++ b/crates/nvisy-toolkit/src/redaction/deanonymizer/decrypt.rs @@ -105,7 +105,7 @@ impl Deanonymizer for Decrypt { #[cfg(test)] mod tests { - use nvisy_core::entity::{EntityKind, TrailStep}; + use nvisy_core::entity::{TrailStep, builtins}; use nvisy_core::modality::{TabularLocation, TextLocation}; use nvisy_core::primitive::Confidence; @@ -121,7 +121,7 @@ mod tests { fn text_entity(start: usize, end: usize) -> Entity { Entity::builder() - .with_entity_kind(EntityKind::EmailAddress) + .with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_location(TextLocation::new(start, end)) .with_confidence(Confidence::new(1.0).unwrap()) .with_trail(Vec::::new()) @@ -131,7 +131,7 @@ mod tests { fn tabular_entity(row: u32, col: u32) -> Entity { Entity::builder() - .with_entity_kind(EntityKind::EmailAddress) + 
.with_label(builtins::EMAIL_ADDRESS.label_ref()) .with_location(TabularLocation::new(row, col)) .with_confidence(Confidence::new(1.0).unwrap()) .with_trail(Vec::::new()) diff --git a/crates/nvisy-toolkit/src/redaction/registry.rs b/crates/nvisy-toolkit/src/redaction/registry.rs index 39cee3dd..7d3a56f0 100644 --- a/crates/nvisy-toolkit/src/redaction/registry.rs +++ b/crates/nvisy-toolkit/src/redaction/registry.rs @@ -4,23 +4,23 @@ //! The registry exposes two independent indexes plus an optional //! catch-all: //! -//! - **`by_kind`** — keyed by [`EntityKind`]. The dispatch the -//! toolkit-only pipeline uses: "this entity has kind -//! `EmailAddress`; what operator do I run?". Populated by callers -//! with `insert_kind`. +//! - **`by_label`** — keyed by [`EntityLabelRef`]. The dispatch +//! the toolkit-only pipeline uses: "this entity has label +//! `email_address`; what operator do I run?". Populated by +//! callers with `insert_label`. //! - **`by_id`** — keyed by [`AnonymizerId`]. The dispatch the //! document-side policy layer uses when a policy rule resolves to //! `Custom { name }` and the named operator must be looked up by //! string id. Populated by callers with `insert_id`. -//! - **`fallback`** — the operator to use when `by_kind.get(kind)` -//! misses. Optional; when absent, unregistered kinds skip. +//! - **`fallback`** — the operator to use when `by_label.get(label)` +//! misses. Optional; when absent, unregistered labels skip. //! //! The two indexes are independent: registering the same operator //! both by kind and by id is a deliberate call-site choice, not an //! automatic mirroring. //! //! ```ignore -//! use nvisy_core::entity::EntityKind; +//! use nvisy_core::entity::builtins; //! use nvisy_core::modality::Text; //! use nvisy_toolkit::redaction::anonymizer::{Mask, Redact, Replace}; //! use nvisy_toolkit::redaction::{AnonymizerId, RedactionRegistry}; @@ -28,8 +28,8 @@ //! 
const KMS_ENCRYPT: AnonymizerId = AnonymizerId::from_static("kms_encrypt"); //! //! let registry = RedactionRegistry::::new() -//! .insert_kind(EntityKind::EmailAddress, Replace::new("[EMAIL]")) -//! .insert_kind(EntityKind::PaymentCard, Mask::new('#').with_keep_suffix(4)) +//! .insert_label(builtins::EMAIL_ADDRESS.label_ref(), Replace::new("[EMAIL]")) +//! .insert_label(builtins::PAYMENT_CARD.label_ref(), Mask::new('#').with_keep_suffix(4)) //! .insert_id(KMS_ENCRYPT, MyKmsOperator::new(client)) //! .with_fallback(Redact); //! ``` @@ -40,7 +40,7 @@ use std::collections::HashMap; use std::sync::Arc; use nvisy_core::Result; -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef}; use nvisy_core::extraction::DataAt; use nvisy_core::modality::Modality; use nvisy_core::redaction::Redactions; @@ -48,38 +48,39 @@ use nvisy_core::redaction::Redactions; use super::{Anonymizer, AnonymizerId}; /// Per-modality registry of [`Anonymizer`] instances, indexed by -/// both [`EntityKind`] (toolkit-side per-kind dispatch) and +/// both [`EntityLabelRef`] (toolkit-side per-label dispatch) and /// [`AnonymizerId`] (policy-side custom-operator resolution). /// /// [`Anonymizer`]: super::Anonymizer pub struct RedactionRegistry { - by_kind: HashMap>>, + by_label: HashMap>>, by_id: HashMap, Arc>>, fallback: Option>>, } impl RedactionRegistry { - /// Build an empty registry. Use [`insert_kind`], [`insert_id`], + /// Build an empty registry. Use [`insert_label`], [`insert_id`], /// or [`with_fallback`] to populate it. /// - /// [`insert_kind`]: Self::insert_kind + /// [`insert_label`]: Self::insert_label /// [`insert_id`]: Self::insert_id /// [`with_fallback`]: Self::with_fallback #[must_use] pub fn new() -> Self { Self { - by_kind: HashMap::new(), + by_label: HashMap::new(), by_id: HashMap::new(), fallback: None, } } - /// Register `op` as the operator the toolkit pipeline picks when - /// it encounters an entity of `kind`. 
Re-registering the same - /// kind replaces the previous instance. + /// Register `op` as the operator the toolkit pipeline picks + /// when it encounters an entity carrying `label`. + /// Re-registering the same label replaces the previous + /// instance. #[must_use] - pub fn insert_kind(mut self, kind: EntityKind, op: impl Anonymizer + 'static) -> Self { - self.by_kind.insert(kind, Arc::new(op)); + pub fn insert_label(mut self, label: EntityLabelRef, op: impl Anonymizer + 'static) -> Self { + self.by_label.insert(label, Arc::new(op)); self } @@ -102,13 +103,13 @@ impl RedactionRegistry { self } - /// Resolve an entity-kind to its registered operator, falling - /// back to the catch-all when no per-kind binding exists. - /// Returns `None` only when neither a per-kind operator nor a + /// Resolve an entity label to its registered operator, falling + /// back to the catch-all when no per-label binding exists. + /// Returns `None` only when neither a per-label operator nor a /// fallback was registered. #[must_use] - pub fn resolve(&self, kind: EntityKind) -> Option<&Arc>> { - self.by_kind.get(&kind).or(self.fallback.as_ref()) + pub fn resolve(&self, label: &EntityLabelRef) -> Option<&Arc>> { + self.by_label.get(label).or(self.fallback.as_ref()) } /// Resolve an [`AnonymizerId`] to its registered operator. @@ -119,10 +120,10 @@ impl RedactionRegistry { self.by_id.get(id) } - /// Number of distinct entity-kinds registered. + /// Number of distinct entity labels registered. #[must_use] - pub fn kinds_len(&self) -> usize { - self.by_kind.len() + pub fn labels_len(&self) -> usize { + self.by_label.len() } /// Number of distinct ids registered. @@ -134,7 +135,7 @@ impl RedactionRegistry { /// `true` when neither index nor a fallback are registered. 
#[must_use] pub fn is_empty(&self) -> bool { - self.by_kind.is_empty() && self.by_id.is_empty() && self.fallback.is_none() + self.by_label.is_empty() && self.by_id.is_empty() && self.fallback.is_none() } /// Run [`resolve`] + [`Anonymizer::apply`] over every entity, and @@ -147,7 +148,7 @@ impl RedactionRegistry { /// never sees the whole document. Entities whose location can't /// be resolved by the source are skipped. /// - /// Entities whose kind has no per-kind operator and where no + /// Entities whose label has no per-label operator and where no /// fallback was registered are skipped (counted as a debug-level /// tracing event); the rest are applied in iteration order. /// @@ -168,7 +169,7 @@ impl RedactionRegistry { let mut skipped = 0usize; let mut unresolved = 0usize; for entity in entities { - let Some(op) = self.resolve(entity.entity_kind) else { + let Some(op) = self.resolve(&entity.label) else { skipped += 1; continue; }; @@ -183,7 +184,7 @@ impl RedactionRegistry { tracing::debug!( target: "nvisy_toolkit::redaction::registry", skipped, - "RedactionRegistry::apply_all skipped entities with no per-kind operator and no fallback", + "RedactionRegistry::apply_all skipped entities with no per-label operator and no fallback", ); } if unresolved > 0 { @@ -206,7 +207,7 @@ impl Default for RedactionRegistry { impl Clone for RedactionRegistry { fn clone(&self) -> Self { Self { - by_kind: self.by_kind.clone(), + by_label: self.by_label.clone(), by_id: self.by_id.clone(), fallback: self.fallback.clone(), } @@ -216,7 +217,7 @@ impl Clone for RedactionRegistry { impl std::fmt::Debug for RedactionRegistry { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("RedactionRegistry") - .field("kinds", &self.by_kind.len()) + .field("labels", &self.by_label.len()) .field("ids", &self.by_id.len()) .field("fallback", &self.fallback.is_some()) .finish() @@ -226,7 +227,7 @@ impl std::fmt::Debug for RedactionRegistry { #[cfg(test)] mod tests { - 
use nvisy_core::entity::Entity; + use nvisy_core::entity::{Entity, builtins}; use nvisy_core::modality::{Text, TextData, TextLocation}; use nvisy_core::primitive::Confidence; @@ -263,9 +264,9 @@ mod tests { } } - fn entity(kind: EntityKind, start: usize, end: usize) -> Entity { + fn entity(label: EntityLabelRef, start: usize, end: usize) -> Entity { Entity::::builder() - .with_entity_kind(kind) + .with_label(label) .with_location(TextLocation::new(start, end)) .with_confidence(Confidence::new(0.9).unwrap()) .build() @@ -276,16 +277,18 @@ mod tests { fn empty_registry_resolves_nothing() { let r = RedactionRegistry::::new(); assert!(r.is_empty()); - assert!(r.resolve(EntityKind::EmailAddress).is_none()); + assert!(r.resolve(&builtins::EMAIL_ADDRESS.label_ref()).is_none()); assert!(r.resolve_id(&AnonymizerId::from_static("kms")).is_none()); } #[test] - fn insert_kind_then_resolve_returns_operator() { - let r = RedactionRegistry::::new() - .insert_kind(EntityKind::EmailAddress, StubAnonymizer("[EMAIL]")); - assert_eq!(r.kinds_len(), 1); - assert!(r.resolve(EntityKind::EmailAddress).is_some()); + fn insert_label_then_resolve_returns_operator() { + let r = RedactionRegistry::::new().insert_label( + builtins::EMAIL_ADDRESS.label_ref(), + StubAnonymizer("[EMAIL]"), + ); + assert_eq!(r.labels_len(), 1); + assert!(r.resolve(&builtins::EMAIL_ADDRESS.label_ref()).is_some()); } #[test] @@ -297,30 +300,34 @@ mod tests { } #[test] - fn fallback_covers_unregistered_kinds() { + fn fallback_covers_unregistered_labels() { let r = RedactionRegistry::::new().with_fallback(StubAnonymizer("[REDACTED]")); - assert!(r.resolve(EntityKind::PaymentCard).is_some()); + assert!(r.resolve(&builtins::PAYMENT_CARD.label_ref()).is_some()); } #[test] - fn per_kind_wins_over_fallback() { + fn per_label_wins_over_fallback() { let r = RedactionRegistry::::new() - .insert_kind(EntityKind::EmailAddress, StubAnonymizer("[EMAIL]")) + .insert_label( + builtins::EMAIL_ADDRESS.label_ref(), + 
StubAnonymizer("[EMAIL]"), + ) .with_fallback(StubAnonymizer("[OTHER]")); - // Both resolve, but per-kind takes precedence — exercised - // indirectly via apply_all below. - assert!(r.resolve(EntityKind::EmailAddress).is_some()); - assert!(r.resolve(EntityKind::PaymentCard).is_some()); + assert!(r.resolve(&builtins::EMAIL_ADDRESS.label_ref()).is_some()); + assert!(r.resolve(&builtins::PAYMENT_CARD.label_ref()).is_some()); } #[tokio::test] - async fn apply_all_uses_per_kind_with_fallback() { + async fn apply_all_uses_per_label_with_fallback() { let r = RedactionRegistry::::new() - .insert_kind(EntityKind::EmailAddress, StubAnonymizer("[EMAIL]")) + .insert_label( + builtins::EMAIL_ADDRESS.label_ref(), + StubAnonymizer("[EMAIL]"), + ) .with_fallback(StubAnonymizer("[OTHER]")); let entities = [ - entity(EntityKind::EmailAddress, 0, 5), - entity(EntityKind::PaymentCard, 6, 10), + entity(builtins::EMAIL_ADDRESS.label_ref(), 0, 5), + entity(builtins::PAYMENT_CARD.label_ref(), 6, 10), ]; let source = StubSource("abcdefghij".to_owned()); let rs = r.apply_all(entities.iter(), &source).await.unwrap(); @@ -332,11 +339,13 @@ mod tests { #[tokio::test] async fn apply_all_skips_unmatched_entities_without_fallback() { - let r = RedactionRegistry::::new() - .insert_kind(EntityKind::EmailAddress, StubAnonymizer("[EMAIL]")); + let r = RedactionRegistry::::new().insert_label( + builtins::EMAIL_ADDRESS.label_ref(), + StubAnonymizer("[EMAIL]"), + ); let entities = [ - entity(EntityKind::EmailAddress, 0, 5), - entity(EntityKind::PaymentCard, 6, 10), + entity(builtins::EMAIL_ADDRESS.label_ref(), 0, 5), + entity(builtins::PAYMENT_CARD.label_ref(), 6, 10), ]; let source = StubSource("abcdefghij".to_owned()); let rs = r.apply_all(entities.iter(), &source).await.unwrap(); diff --git a/crates/nvisy-toolkit/tests/codec_e2e_csv.rs b/crates/nvisy-toolkit/tests/codec_e2e_csv.rs index 07328d0c..a39429ac 100644 --- a/crates/nvisy-toolkit/tests/codec_e2e_csv.rs +++ 
b/crates/nvisy-toolkit/tests/codec_e2e_csv.rs @@ -15,7 +15,7 @@ mod fixtures; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use crate::fixtures::{Fixture, assert_pii_removed, assert_tabular_entity, assert_tokens_present}; @@ -32,18 +32,58 @@ async fn csv_codec_detects_and_redacts() { // Header row 0 is name,email,phone,card,iban,ssn,host; // data rows are 1 (Alice) and 2 (Bob). for (kind, row, col, cell) in [ - (EntityKind::EmailAddress, 1, 1, "alice.johnson@example.com"), - (EntityKind::EmailAddress, 2, 1, "bob.smith@example.com"), - (EntityKind::PhoneNumber, 1, 2, "+1 (415) 555-0142"), - (EntityKind::PhoneNumber, 2, 2, "+1 (510) 555-0199"), - (EntityKind::PaymentCard, 1, 3, "4111 1111 1111 1111"), - (EntityKind::PaymentCard, 2, 3, "5555 5555 5555 4444"), - (EntityKind::Iban, 1, 4, "GB29 NWBK 6016 1331 9268 19"), - (EntityKind::Iban, 2, 4, "DE89 3704 0044 0532 0130 00"), - (EntityKind::GovernmentId, 1, 5, "123-45-6789"), - (EntityKind::GovernmentId, 2, 5, "234-56-7890"), - (EntityKind::IpAddress, 1, 6, "192.168.1.42"), - (EntityKind::IpAddress, 2, 6, "10.0.0.7"), + ( + builtins::EMAIL_ADDRESS.label_ref(), + 1, + 1, + "alice.johnson@example.com", + ), + ( + builtins::EMAIL_ADDRESS.label_ref(), + 2, + 1, + "bob.smith@example.com", + ), + ( + builtins::PHONE_NUMBER.label_ref(), + 1, + 2, + "+1 (415) 555-0142", + ), + ( + builtins::PHONE_NUMBER.label_ref(), + 2, + 2, + "+1 (510) 555-0199", + ), + ( + builtins::PAYMENT_CARD.label_ref(), + 1, + 3, + "4111 1111 1111 1111", + ), + ( + builtins::PAYMENT_CARD.label_ref(), + 2, + 3, + "5555 5555 5555 4444", + ), + ( + builtins::IBAN.label_ref(), + 1, + 4, + "GB29 NWBK 6016 1331 9268 19", + ), + ( + builtins::IBAN.label_ref(), + 2, + 4, + "DE89 3704 0044 0532 0130 00", + ), + (builtins::GOVERNMENT_ID.label_ref(), 1, 5, "123-45-6789"), + (builtins::GOVERNMENT_ID.label_ref(), 2, 5, "234-56-7890"), + (builtins::IP_ADDRESS.label_ref(), 1, 6, "192.168.1.42"), + (builtins::IP_ADDRESS.label_ref(), 2, 6, 
"10.0.0.7"), ] { assert_tabular_entity(cell, &outcome.entities, kind, row, col, cell); } diff --git a/crates/nvisy-toolkit/tests/codec_e2e_html.rs b/crates/nvisy-toolkit/tests/codec_e2e_html.rs index 04ada77c..094d4441 100644 --- a/crates/nvisy-toolkit/tests/codec_e2e_html.rs +++ b/crates/nvisy-toolkit/tests/codec_e2e_html.rs @@ -21,7 +21,7 @@ mod fixtures; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use crate::fixtures::{Fixture, assert_pii_removed, assert_tokens_present}; @@ -40,20 +40,20 @@ async fn html_codec_detects_and_redacts() { // per-needle slice assertion the txt/json tests use doesn't // translate here; presence-by-kind is the right shape. for expected in [ - EntityKind::EmailAddress, - EntityKind::PhoneNumber, - EntityKind::PaymentCard, - EntityKind::Iban, - EntityKind::GovernmentId, - EntityKind::IpAddress, + builtins::EMAIL_ADDRESS.label_ref(), + builtins::PHONE_NUMBER.label_ref(), + builtins::PAYMENT_CARD.label_ref(), + builtins::IBAN.label_ref(), + builtins::GOVERNMENT_ID.label_ref(), + builtins::IP_ADDRESS.label_ref(), ] { assert!( - outcome.entities.iter().any(|e| e.entity_kind == expected), + outcome.entities.iter().any(|e| e.label == expected), "expected at least one {expected:?} entity; got: {:?}", outcome .entities .iter() - .map(|e| e.entity_kind) + .map(|e| e.label.clone()) .collect::>() ); } diff --git a/crates/nvisy-toolkit/tests/codec_e2e_json.rs b/crates/nvisy-toolkit/tests/codec_e2e_json.rs index a56123fa..702d3f31 100644 --- a/crates/nvisy-toolkit/tests/codec_e2e_json.rs +++ b/crates/nvisy-toolkit/tests/codec_e2e_json.rs @@ -9,7 +9,7 @@ mod fixtures; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use crate::fixtures::{Fixture, assert_pii_removed, assert_text_entity, assert_tokens_present}; @@ -27,18 +27,21 @@ async fn json_codec_detects_and_redacts() { // its escape table — so slicing the fixture by an entity's // location yields the same bytes the recognizer matched. 
for (kind, needle) in [ - (EntityKind::EmailAddress, "alice.johnson@example.com"), - (EntityKind::EmailAddress, "bob.smith@example.com"), - (EntityKind::PhoneNumber, "+1 (415) 555-0142"), - (EntityKind::PhoneNumber, "+1 (510) 555-0199"), - (EntityKind::PaymentCard, "4111 1111 1111 1111"), - (EntityKind::PaymentCard, "5555 5555 5555 4444"), - (EntityKind::Iban, "GB29 NWBK 6016 1331 9268 19"), - (EntityKind::Iban, "DE89 3704 0044 0532 0130 00"), - (EntityKind::GovernmentId, "123-45-6789"), - (EntityKind::GovernmentId, "234-56-7890"), - (EntityKind::IpAddress, "192.168.1.42"), - (EntityKind::IpAddress, "10.0.0.7"), + ( + builtins::EMAIL_ADDRESS.label_ref(), + "alice.johnson@example.com", + ), + (builtins::EMAIL_ADDRESS.label_ref(), "bob.smith@example.com"), + (builtins::PHONE_NUMBER.label_ref(), "+1 (415) 555-0142"), + (builtins::PHONE_NUMBER.label_ref(), "+1 (510) 555-0199"), + (builtins::PAYMENT_CARD.label_ref(), "4111 1111 1111 1111"), + (builtins::PAYMENT_CARD.label_ref(), "5555 5555 5555 4444"), + (builtins::IBAN.label_ref(), "GB29 NWBK 6016 1331 9268 19"), + (builtins::IBAN.label_ref(), "DE89 3704 0044 0532 0130 00"), + (builtins::GOVERNMENT_ID.label_ref(), "123-45-6789"), + (builtins::GOVERNMENT_ID.label_ref(), "234-56-7890"), + (builtins::IP_ADDRESS.label_ref(), "192.168.1.42"), + (builtins::IP_ADDRESS.label_ref(), "10.0.0.7"), ] { assert_text_entity(FIXTURE.source, &outcome.entities, kind, needle); } diff --git a/crates/nvisy-toolkit/tests/codec_e2e_txt.rs b/crates/nvisy-toolkit/tests/codec_e2e_txt.rs index 75ef5865..e3425a0a 100644 --- a/crates/nvisy-toolkit/tests/codec_e2e_txt.rs +++ b/crates/nvisy-toolkit/tests/codec_e2e_txt.rs @@ -5,7 +5,7 @@ mod fixtures; -use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use crate::fixtures::{Fixture, assert_pii_removed, assert_text_entity, assert_tokens_present}; @@ -20,12 +20,15 @@ async fn txt_codec_detects_and_redacts() { let outcome = FIXTURE.run_text_pipeline().await; for (kind, needle) in [ - 
(EntityKind::EmailAddress, "alice.johnson@example.com"), - (EntityKind::PhoneNumber, "+1 (415) 555-0142"), - (EntityKind::PaymentCard, "4111 1111 1111 1111"), - (EntityKind::Iban, "GB29 NWBK 6016 1331 9268 19"), - (EntityKind::GovernmentId, "123-45-6789"), - (EntityKind::IpAddress, "192.168.1.42"), + ( + builtins::EMAIL_ADDRESS.label_ref(), + "alice.johnson@example.com", + ), + (builtins::PHONE_NUMBER.label_ref(), "+1 (415) 555-0142"), + (builtins::PAYMENT_CARD.label_ref(), "4111 1111 1111 1111"), + (builtins::IBAN.label_ref(), "GB29 NWBK 6016 1331 9268 19"), + (builtins::GOVERNMENT_ID.label_ref(), "123-45-6789"), + (builtins::IP_ADDRESS.label_ref(), "192.168.1.42"), ] { assert_text_entity(FIXTURE.source, &outcome.entities, kind, needle); } diff --git a/crates/nvisy-toolkit/tests/fixtures/asserts.rs b/crates/nvisy-toolkit/tests/fixtures/asserts.rs index 5ec70c55..908ea913 100644 --- a/crates/nvisy-toolkit/tests/fixtures/asserts.rs +++ b/crates/nvisy-toolkit/tests/fixtures/asserts.rs @@ -1,22 +1,27 @@ //! Entity-presence and redaction-output assertion helpers for the //! codec E2E tests. -use nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::{Entity, EntityLabelRef, builtins}; use nvisy_core::modality::{Tabular, Text}; /// Assert at least one `Entity` of `kind` matches `needle` /// when its location is sliced against `source`. 
#[track_caller] -pub fn assert_text_entity(source: &str, entities: &[Entity], kind: EntityKind, needle: &str) { +pub fn assert_text_entity( + source: &str, + entities: &[Entity], + label: EntityLabelRef, + needle: &str, +) { let hit = entities .iter() - .any(|e| e.entity_kind == kind && &source[e.location.start..e.location.end] == needle); + .any(|e| e.label == label && &source[e.location.start..e.location.end] == needle); assert!( hit, - "expected `{needle}` as {kind:?}; got: {:?}", + "expected `{needle}` as {label:?}; got: {:?}", entities .iter() - .map(|e| (e.entity_kind, &source[e.location.start..e.location.end])) + .map(|e| (e.label.clone(), &source[e.location.start..e.location.end])) .collect::>() ); } @@ -27,13 +32,13 @@ pub fn assert_text_entity(source: &str, entities: &[Entity], kind: EntityK pub fn assert_tabular_entity( cell_value: &str, entities: &[Entity], - kind: EntityKind, + label: EntityLabelRef, row: u32, col: u32, needle: &str, ) { let hit = entities.iter().any(|e| { - if e.entity_kind != kind { + if e.label != label { return false; } if e.location.row_index != row || e.location.column_index != col { @@ -45,10 +50,14 @@ pub fn assert_tabular_entity( }); assert!( hit, - "expected `{needle}` as {kind:?} at ({row},{col}); got: {:?}", + "expected `{needle}` as {label:?} at ({row},{col}); got: {:?}", entities .iter() - .map(|e| (e.entity_kind, e.location.row_index, e.location.column_index)) + .map(|e| ( + e.label.clone(), + e.location.row_index, + e.location.column_index + )) .collect::>() ); } diff --git a/crates/nvisy-toolkit/tests/fixtures/registries.rs b/crates/nvisy-toolkit/tests/fixtures/registries.rs index 186c208d..08657054 100644 --- a/crates/nvisy-toolkit/tests/fixtures/registries.rs +++ b/crates/nvisy-toolkit/tests/fixtures/registries.rs @@ -1,7 +1,7 @@ //! Shared recognizer + redaction registry constructors and dedup //! params used by every codec E2E test. 
-use nvisy_core::entity::EntityKind; +use nvisy_core::entity::builtins; use nvisy_core::modality::Modality; use nvisy_core::primitive::ConfidenceThreshold; use nvisy_pattern::{PatternRecognizer, PatternRegistry}; @@ -21,7 +21,7 @@ pub fn shipped_recognizer() -> PatternRecognizer { /// patterns can emit is mapped to a deterministic operator so test /// assertions can spot-check the replacement tokens. /// -/// - emails, phones, IBANs, government ids, IPs → `[{entity_kind}]` +/// - emails, phones, IBANs, government ids, IPs → `[{label}]` /// - payment cards → `Mask::stars()` (digits masked, no token) pub fn redaction_registry() -> RedactionRegistry where @@ -30,12 +30,21 @@ where Mask: Anonymizer, { RedactionRegistry::::new() - .insert_kind(EntityKind::EmailAddress, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::PhoneNumber, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::Iban, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::GovernmentId, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::IpAddress, Replace::new("[{entity_kind}]")) - .insert_kind(EntityKind::PaymentCard, Mask::stars()) + .insert_label( + builtins::EMAIL_ADDRESS.label_ref(), + Replace::new("[{label}]"), + ) + .insert_label( + builtins::PHONE_NUMBER.label_ref(), + Replace::new("[{label}]"), + ) + .insert_label(builtins::IBAN.label_ref(), Replace::new("[{label}]")) + .insert_label( + builtins::GOVERNMENT_ID.label_ref(), + Replace::new("[{label}]"), + ) + .insert_label(builtins::IP_ADDRESS.label_ref(), Replace::new("[{label}]")) + .insert_label(builtins::PAYMENT_CARD.label_ref(), Mask::stars()) } /// Standard dedup params: a `0.5` confidence threshold drops the diff --git a/crates/nvisy-toolkit/tests/recognition_registry.rs b/crates/nvisy-toolkit/tests/recognition_registry.rs index 8b4bb869..1556e2d8 100644 --- a/crates/nvisy-toolkit/tests/recognition_registry.rs +++ b/crates/nvisy-toolkit/tests/recognition_registry.rs @@ -22,7 +22,7 @@ use std::env; -use 
nvisy_core::entity::{Entity, EntityKind}; +use nvisy_core::entity::Entity; use nvisy_core::modality::{Text, TextData}; use nvisy_core::recognition::RecognizerInput; use nvisy_llm::backend::rig::RigBackend; @@ -55,7 +55,12 @@ fn build_registry() -> RecognizerRegistry { let ner = NerRecognizer::builder() .with_name("ner") .with_engine(bento_backend) - .with_supported_kinds(EntityKind::all().collect::>()) + .with_supported_labels( + nvisy_core::entity::EntityLabelCatalog::with_builtins() + .iter() + .map(|l| l.label_ref()) + .collect::>(), + ) .build() .expect("ner recognizer builds"); From 0814a80948225281735fec4d94564b0b07c1e2e7 Mon Sep 17 00:00:00 2001 From: Oleh Martsokha Date: Sat, 13 Jun 2026 18:42:06 +0200 Subject: [PATCH 2/3] feat(engine): per-request EntityLabelCatalog supplied via policies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Labels are now per-request rather than per-deployment. Each submitted `Policy` carries the labels it operates over; the engine unions them into an `EntityLabelCatalog` driving recognizer dispatch and selector tag matching for that request. Engine flow: - New `Policy::labels: Vec<EntityLabel>` field. `unify_labels` unions every policy's labels with conflict detection (same name with non-equal description/tags → HTTP 400). - `validate_selector_labels` ensures every selector label name exists in the unioned catalog. - `DetectionConfig::build_for_request(catalog)` builds a fresh `RecognizerRegistry` per request: patterns/dictionaries are filtered against the catalog (entries with unregistered labels never run), NER's zero-shot label list is sourced from the catalog. - `EntitySelector::matches(entity, catalog)` dereferences tags against the request catalog. The `BUILTIN_CATALOG: LazyLock<EntityLabelCatalog>` in selector.rs is gone. Deletions: - Engine-startup `Arc<RecognizerRegistry>` and `DetectionEngineState.recognizer_registry` — replaced by an `Arc<DetectionConfig>` template. 
- `Detection` plan node + `cfg.labels.contains(...)` post-filter in `detection.rs` — superseded by registry-construction-time filtering. - `default_text_labels()` — no more hardcoded label allowlist. Plus a workspace-wide style sweep: inline `tracing::*` macros and attributes (`#[tracing::instrument(...)]`, `tracing::Level::INFO`) and convert remaining collapsed-form rustdoc links `[X][path]` to the reference form `[X]` + `[X]: path` at the bottom of doc blocks. Co-Authored-By: Claude Opus 4.7 --- .../nvisy-core/src/entity/label/builtins.rs | 8 +- .../src/entity/label/entity_label.rs | 8 +- .../src/entity/label/entity_label_ref.rs | 17 +- crates/nvisy-core/src/entity/label/mod.rs | 17 +- crates/nvisy-engine/src/core/policy_store.rs | 43 +++- crates/nvisy-engine/src/core/shared.rs | 8 + crates/nvisy-engine/src/phases/detection.rs | 26 +-- .../nvisy-engine/src/phases/redaction/mod.rs | 3 +- .../src/phases/redaction/phase.rs | 6 + .../src/pipeline/config/detection/mod.rs | 65 +++--- .../src/pipeline/config/detection/plan.rs | 50 ---- .../nvisy-engine/src/pipeline/config/mod.rs | 14 +- .../src/pipeline/detection/pipeline.rs | 46 ++-- crates/nvisy-engine/src/pipeline/engine.rs | 22 +- crates/nvisy-engine/src/pipeline/mod.rs | 6 +- crates/nvisy-engine/src/pipeline/plan.rs | 4 +- .../src/pipeline/redaction/pipeline.rs | 6 +- crates/nvisy-engine/src/policy/mod.rs | 220 ++++++++++++++++++ crates/nvisy-engine/src/policy/selector.rs | 30 +-- crates/nvisy-ner/src/backend/mod.rs | 10 +- crates/nvisy-ner/src/backend/ner_span.rs | 7 +- crates/nvisy-ocr/src/extraction/extractor.rs | 5 +- .../nvisy-pattern/src/recognition/registry.rs | 20 ++ .../src/middleware/observability.rs | 9 +- crates/nvisy-stt/src/extraction/mod.rs | 3 +- 25 files changed, 447 insertions(+), 206 deletions(-) delete mode 100644 crates/nvisy-engine/src/pipeline/config/detection/plan.rs diff --git a/crates/nvisy-core/src/entity/label/builtins.rs b/crates/nvisy-core/src/entity/label/builtins.rs index 
63fe3d94..8c7bd9d9 100644 --- a/crates/nvisy-core/src/entity/label/builtins.rs +++ b/crates/nvisy-core/src/entity/label/builtins.rs @@ -7,9 +7,11 @@ //! separate enum. //! //! The `BUILT_INS` slice indexes every constant for the -//! [`EntityLabelCatalog::with_builtins`][super::EntityLabelCatalog::with_builtins] -//! constructor; the constants themselves are public and reachable -//! by name (e.g. `builtins::PERSON_NAME`). +//! [`EntityLabelCatalog::with_builtins`] constructor; the +//! constants themselves are public and reachable by name (e.g. +//! `builtins::PERSON_NAME`). +//! +//! [`EntityLabelCatalog::with_builtins`]: super::EntityLabelCatalog::with_builtins use std::sync::LazyLock; diff --git a/crates/nvisy-core/src/entity/label/entity_label.rs b/crates/nvisy-core/src/entity/label/entity_label.rs index 329f5189..2503bf4a 100644 --- a/crates/nvisy-core/src/entity/label/entity_label.rs +++ b/crates/nvisy-core/src/entity/label/entity_label.rs @@ -109,9 +109,11 @@ impl EntityLabel { self.tags.iter().any(|t| t == tag) } - /// Construct a name-only [`EntityLabelRef`][super::EntityLabelRef] - /// handle to this label. Clones the underlying [`HipStr`] - /// (a refcount bump for `from_static` labels — no allocation). + /// Construct a name-only [`EntityLabelRef`] handle to this + /// label. Clones the underlying [`HipStr`] (a refcount bump + /// for `from_static` labels — no allocation). + /// + /// [`EntityLabelRef`]: super::EntityLabelRef #[must_use] pub fn label_ref(&self) -> EntityLabelRef { EntityLabelRef::from(self.name.clone()) diff --git a/crates/nvisy-core/src/entity/label/entity_label_ref.rs b/crates/nvisy-core/src/entity/label/entity_label_ref.rs index 917519ef..43cfea6d 100644 --- a/crates/nvisy-core/src/entity/label/entity_label_ref.rs +++ b/crates/nvisy-core/src/entity/label/entity_label_ref.rs @@ -9,10 +9,10 @@ //! //! Catalog-side metadata (description, tags) lives on //! [`EntityLabel`] and is dereferenced through -//! 
[`EntityLabelCatalog::lookup`][catalog-lookup] when a consumer needs it. +//! [`EntityLabelCatalog::lookup`] when a consumer needs it. //! //! [`EntityLabel`]: super::EntityLabel -//! [catalog-lookup]: super::EntityLabelCatalog::lookup +//! [`EntityLabelCatalog::lookup`]: super::EntityLabelCatalog::lookup use std::fmt; @@ -20,12 +20,15 @@ use hipstr::HipStr; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -/// Name-only handle to an [`EntityLabel`][super::EntityLabel]. -/// Cheap-clone wrapper around [`HipStr<'static>`]. +/// Name-only handle to an [`EntityLabel`]. Cheap-clone wrapper +/// around [`HipStr<'static>`]. /// -/// Carried on every [`Entity`][crate::entity::Entity] in place of -/// the full catalog metadata. Two refs are equal when their names -/// are equal byte-for-byte. +/// Carried on every [`Entity`] in place of the full catalog +/// metadata. Two refs are equal when their names are equal +/// byte-for-byte. +/// +/// [`EntityLabel`]: super::EntityLabel +/// [`Entity`]: crate::entity::Entity #[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, JsonSchema)] #[serde(transparent)] #[schemars(with = "String")] diff --git a/crates/nvisy-core/src/entity/label/mod.rs b/crates/nvisy-core/src/entity/label/mod.rs index abfd2aff..ea4c3324 100644 --- a/crates/nvisy-core/src/entity/label/mod.rs +++ b/crates/nvisy-core/src/entity/label/mod.rs @@ -6,17 +6,20 @@ //! tags). Authored once per label; consumed by selectors and //! audit-rendering tooling that need the metadata. //! - [`EntityLabelRef`] — name-only handle stored on every -//! detected [`Entity`][crate::entity::Entity]. Cheap-clone -//! wrapper around [`HipStr<'static>`][hipstr::HipStr]. -//! - [`EntityLabelCatalog`] — name-indexed lookup over a collection of -//! `EntityLabel`s. The workspace ships a built-in catalog -//! constructed from [`EntityLabelCatalog::with_builtins`]; consumers can -//! register custom labels alongside or instead of the -//! built-ins. +//! 
detected [`Entity`]. Cheap-clone wrapper around +//! [`HipStr<'static>`]. +//! - [`EntityLabelCatalog`] — name-indexed lookup over a +//! collection of `EntityLabel`s. The workspace ships a built-in +//! catalog constructed from [`EntityLabelCatalog::with_builtins`]; +//! consumers can register custom labels alongside or instead of +//! the built-ins. //! - [`builtins`] — every built-in `EntityLabel` constant //! (`builtins::PERSON_NAME`, `builtins::EMAIL_ADDRESS`, …) plus //! the internal `BUILT_INS` slice the catalog walks at //! construction time. +//! +//! [`Entity`]: crate::entity::Entity +//! [`HipStr<'static>`]: hipstr::HipStr pub mod builtins; mod entity_label; diff --git a/crates/nvisy-engine/src/core/policy_store.rs b/crates/nvisy-engine/src/core/policy_store.rs index 14b95157..c440d29d 100644 --- a/crates/nvisy-engine/src/core/policy_store.rs +++ b/crates/nvisy-engine/src/core/policy_store.rs @@ -64,6 +64,38 @@ impl PolicyStore { self.bucket_mut::().push(policy); } + /// Union every stored policy's [`Policy::labels`] into a single + /// [`EntityLabelCatalog`]. Used at redaction time to rebuild the + /// per-request catalog the detection pass already validated. + /// Conflicts here are impossible because the same union was + /// validated at detection-time submission. + pub(crate) fn catalog(&self) -> nvisy_core::entity::EntityLabelCatalog { + use crate::modality::{Audio, Image, Tabular, Text}; + + let mut catalog = nvisy_core::entity::EntityLabelCatalog::new(); + for p in self.chain::() { + for l in &p.labels { + catalog.insert(l.clone()); + } + } + for p in self.chain::() { + for l in &p.labels { + catalog.insert(l.clone()); + } + } + for p in self.chain::() { + for l in &p.labels { + catalog.insert(l.clone()); + } + } + for p in self.chain::