diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 68725f51..4ece2801 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -83,6 +83,34 @@ jobs: - name: Clippy run: cargo clippy --workspace -- -D warnings + docs: + name: Docs + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{ env.RUST_TOOLCHAIN }} + + - name: Cache cargo registry and build + uses: actions/cache@v5 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + target + key: ${{ runner.os }}-cargo-docs-${{ hashFiles('**/Cargo.lock') }} + restore-keys: ${{ runner.os }}-cargo-docs- + + - name: Cargo doc + run: cargo doc --workspace --no-deps + env: + RUSTDOCFLAGS: "-D warnings" + test: name: Test runs-on: ubuntu-latest diff --git a/Cargo.lock b/Cargo.lock index 2d9b1229..857e87ed 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1978,6 +1978,15 @@ dependencies = [ "cc", ] +[[package]] +name = "iban_validate" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f695c602ee9bb55c4100c3e50559461ea3f49ca0d543cf770720af8165dc756" +dependencies = [ + "arrayvec", +] + [[package]] name = "icu_collections" version = "2.2.0" @@ -3131,6 +3140,7 @@ dependencies = [ "csv", "derive_builder", "derive_more", + "iban_validate", "nvisy-context", "nvisy-core", "phonenumber", diff --git a/Cargo.toml b/Cargo.toml index ee8f2000..8a6555cf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -102,6 +102,7 @@ unicode-normalization = { version = "0.1", features = [] } # Checksum / encoding bs58 = { version = "0.5", features = ["check"] } +iban_validate = { version = "5.0" } phonenumber = { version = "0.3", default-features = false } # Tabular document parsing diff --git a/crates/nvisy-context/src/io/tokens.rs b/crates/nvisy-context/src/io/tokens.rs index 6f491574..57262d6b 100644 --- a/crates/nvisy-context/src/io/tokens.rs +++ 
b/crates/nvisy-context/src/io/tokens.rs @@ -106,7 +106,7 @@ impl Token { /// borrows the underlying slice via [`as_slice`] and walks it by /// count when scoring the entity's neighbourhood. /// -/// [`Enhancer`]: super::Enhancer +/// [`Enhancer`]: crate::Enhancer /// [`as_slice`]: Tokens::as_slice #[derive(Debug, Clone, Default, PartialEq, Eq)] pub struct Tokens(Vec); diff --git a/crates/nvisy-core/src/entity/label/builtins.rs b/crates/nvisy-core/src/entity/label/builtins.rs index 8c7bd9d9..3d217fde 100644 --- a/crates/nvisy-core/src/entity/label/builtins.rs +++ b/crates/nvisy-core/src/entity/label/builtins.rs @@ -81,6 +81,7 @@ label!(pub SIGNATURE, "signature","Handwritten signature.", ["visual", "pii"]); label!(pub LOGO, "logo","Brand or organisation logo.", ["visual"]); label!(pub BARCODE, "barcode","Barcode or QR code.", ["visual"]); label!(pub ORGANIZATION_NAME, "organization_name","Organization or company name.", ["organization"]); +label!(pub COMPANY_ID, "company_id","Public company-registry identifier (Handelsregisternummer, Companies House number, etc.).", ["organization"]); label!(pub DEPARTMENT_NAME, "department_name","Department or business-unit name.", ["organization"]); label!(pub FACILITY_NAME, "facility_name","Physical facility or location name.", ["organization"]); label!(pub CASE_NUMBER, "case_number","Case, matter, or docket number.", ["organization"]); @@ -150,6 +151,7 @@ pub(super) static BUILT_INS: &[&LazyLock] = &[ &LOGO, &BARCODE, &ORGANIZATION_NAME, + &COMPANY_ID, &DEPARTMENT_NAME, &FACILITY_NAME, &CASE_NUMBER, diff --git a/crates/nvisy-pattern/Cargo.toml b/crates/nvisy-pattern/Cargo.toml index 94c69b56..9a505f14 100644 --- a/crates/nvisy-pattern/Cargo.toml +++ b/crates/nvisy-pattern/Cargo.toml @@ -38,17 +38,16 @@ derive_more = { workspace = true, features = ["from"] } # Async runtime and parallelism async-trait = { workspace = true, features = [] } -# Text processing (regex + Aho-Corasick literal matching) +# Text processing regex = { 
workspace = true, features = [] } aho-corasick = { workspace = true, features = [] } -# Tabular document parsing (dictionary loading from CSV) +# Tabular document parsing csv = { workspace = true, features = [] } -# Base58Check decoder for the crypto.btc validator +# Checksum / encoding bs58 = { workspace = true, features = ["check"] } - -# Region-aware phone-number parsing for the phone validator +iban_validate = { workspace = true } phonenumber = { workspace = true } [dev-dependencies] diff --git a/crates/nvisy-pattern/assets/NOTICE.md b/crates/nvisy-pattern/assets/PRESIDIO.md similarity index 71% rename from crates/nvisy-pattern/assets/NOTICE.md rename to crates/nvisy-pattern/assets/PRESIDIO.md index 161792e5..1b810fdb 100644 --- a/crates/nvisy-pattern/assets/NOTICE.md +++ b/crates/nvisy-pattern/assets/PRESIDIO.md @@ -1,13 +1,16 @@ -# Third-party attribution: shipped pattern assets +# Presidio attribution -Several shipped pattern TOMLs under this directory carry regular -expressions adapted from [Microsoft Presidio][presidio] -(`microsoft/presidio`, MIT licensed), specifically the +Several shipped pattern TOMLs under `patterns/` carry regular +expressions ported or adapted from [Microsoft Presidio][presidio] +(`microsoft/presidio`, MIT-licensed) — specifically the `presidio-analyzer/presidio_analyzer/predefined_recognizers/` -classes referenced inline in each TOML's leading comment. +classes. Validators (Luhn, IBAN mod-97, ABA, DEA, NPI, NHS, +NINO, etc.) were re-implemented in Rust from the same upstream +algorithms. -The Presidio MIT license text is reproduced below, per its -`Permission notice` clause. +The Presidio MIT license text is reproduced below to satisfy its +"include this permission notice in all copies or substantial +portions" clause. 
[presidio]: https://github.com/microsoft/presidio diff --git a/crates/nvisy-pattern/assets/README.md b/crates/nvisy-pattern/assets/README.md new file mode 100644 index 00000000..a41af49b --- /dev/null +++ b/crates/nvisy-pattern/assets/README.md @@ -0,0 +1,94 @@ +# Shipped asset tree + +The `nvisy-pattern` crate compiles every TOML and term-source +file under this directory into the binary via `include_str!`, +so adding a pattern or dictionary is as simple as: + +1. Drop the asset into the right subtree. +2. Wire a `shipped_pattern!` / `shipped_dictionary!` accessor + in `src/shipped/{patterns,dictionaries}/<sub-module>.rs`. +3. Append the accessor to the sub-module's `all()`. + +The recognizer's per-call language and country fields filter +patterns + dictionaries at runtime — see +[`RecognizerInput::applies_to_language`] and +[`RecognizerInput::applies_to_country`]. + +## Layout + +``` +assets/ + patterns/ + world/ jurisdiction-agnostic regex patterns + contact/ email, phone, url + credentials/ aws, github, stripe, generic api, private key + finance/ credit card, iban, swift, btc, eth + network/ ipv4, ipv6, mac + personal/ date of birth, datetime + us/ US-jurisdiction patterns + identity/ ssn, itin, drivers_license, passport, postal_code + finance/ bank_routing, bank_account + health/ npi, mbi, medical_license (DEA) + uk/ UK-jurisdiction patterns + identity/ nhs, nino, driving_licence, passport + contact/ postcode + vehicle/ registration + + dictionaries/ + world/ universal: brand names + codes + finance/ cryptocurrencies (BTC, ETH, Bitcoin, …) + en/ English-language terms + finance/ currencies (USD, US Dollar, EUR, …) + personal/ languages, nationalities, religions +``` + +Each pattern is a TOML file (`.toml`). Each dictionary +pairs a TOML metadata sidecar with a term source: +`.csv` for multi-column lists (term + alias columns with +per-column scores), `.txt` for one-per-line lists. 
+ +## Scoring conventions + +Scores are baseline confidence — the context enhancer (in +`nvisy-context`) lifts them when configured keywords appear +nearby. The toolkit's default confidence threshold is `0.35`; +anything below needs a context boost or an out-of-band hint +(CSV column header, JSON object key, HTML parent text) to +clear it. + +| Tier | Score | Use | +|------|-------|-----| +| Strong | 0.95–0.98 | Branded credential headers (`AKIA…`, `-----BEGIN PRIVATE KEY-----`, `gh[pousr]_…`) | +| Solid | 0.4–0.5 | Format with a checksum or restrictive structure (IBAN, NHS, NPI, MBI, IPv4, MAC, IPv6) | +| Loose | 0.3 | Brand-aware with weak structural specificity (credit_card, dictionaries) | +| Weak | 0.1 | Generic shape that *requires* context to clear threshold (passport, postal_code, DoB) | +| Trace | 0.05 | Last-resort generic regex (bank_account `\b\d{8,17}\b`) | + +The targets mirror Microsoft Presidio's deliberately-conservative +baselines — most of Presidio's predefined recognizers sit in +0.1–0.5 because the context enhancer is expected to lift hits +to 0.6+ when surrounding tokens match. + +## Validators + +A pattern variant can declare `validator = "<name>"` to drop +matches that fail a post-match structural check. Built-in +names (resolved via `ValidatorRegistry::builtin`): + +- Universal: `luhn`, `iban`, `phone`, `date`, `crypto.btc` +- US: `us.ssn`, `us.aba_routing`, `us.npi`, `us.dea_number`, + `us.postal_code` +- UK: `uk.nhs`, `uk.nino`, `uk.driving_licence`, + `uk.vehicle_registration` + +Each lives in `src/validators/` under the matching submodule. + +## Attribution + +Many patterns + validators are ports of upstream Microsoft +Presidio recognizers. See [`PRESIDIO.md`](PRESIDIO.md) for the +MIT-license attribution and the upstream class references that +each adapted TOML's leading comment links to. 
+ +[`RecognizerInput::applies_to_language`]: ../../nvisy-core/src/recognition/input.rs +[`RecognizerInput::applies_to_country`]: ../../nvisy-core/src/recognition/input.rs diff --git a/crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.csv b/crates/nvisy-pattern/assets/dictionaries/en/finance/currencies.csv similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.csv rename to crates/nvisy-pattern/assets/dictionaries/en/finance/currencies.csv diff --git a/crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.toml b/crates/nvisy-pattern/assets/dictionaries/en/finance/currencies.toml similarity index 55% rename from crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.toml rename to crates/nvisy-pattern/assets/dictionaries/en/finance/currencies.toml index 4b25dda3..ee86a7d2 100644 --- a/crates/nvisy-pattern/assets/dictionaries/world/finance/currencies.toml +++ b/crates/nvisy-pattern/assets/dictionaries/en/finance/currencies.toml @@ -1,3 +1,4 @@ name = "currencies" label = "currency" -score = 0.85 +languages = ["en"] +score = 0.4 diff --git a/crates/nvisy-pattern/assets/dictionaries/world/personal/languages.csv b/crates/nvisy-pattern/assets/dictionaries/en/personal/languages.csv similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/world/personal/languages.csv rename to crates/nvisy-pattern/assets/dictionaries/en/personal/languages.csv diff --git a/crates/nvisy-pattern/assets/dictionaries/world/personal/languages.toml b/crates/nvisy-pattern/assets/dictionaries/en/personal/languages.toml similarity index 82% rename from crates/nvisy-pattern/assets/dictionaries/world/personal/languages.toml rename to crates/nvisy-pattern/assets/dictionaries/en/personal/languages.toml index 0a7e0aee..3f530fb5 100644 --- a/crates/nvisy-pattern/assets/dictionaries/world/personal/languages.toml +++ b/crates/nvisy-pattern/assets/dictionaries/en/personal/languages.toml @@ -1,7 +1,8 @@ name = 
"languages" label = "language" +languages = ["en"] # column 0 = long-form names (`English`, `Spanish`, ...) # column 1 = ISO 639-1 codes (`en`, `es`, ...) # column 2 = alternate long-form names (`Farsi` for Persian) -score = [0.85, 0.30, 0.85] +score = [0.4, 0.2, 0.4] diff --git a/crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.toml b/crates/nvisy-pattern/assets/dictionaries/en/personal/nationalities.toml similarity index 59% rename from crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.toml rename to crates/nvisy-pattern/assets/dictionaries/en/personal/nationalities.toml index 001301aa..76f2e553 100644 --- a/crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.toml +++ b/crates/nvisy-pattern/assets/dictionaries/en/personal/nationalities.toml @@ -1,3 +1,4 @@ name = "nationalities" label = "nationality" -score = 0.85 +languages = ["en"] +score = 0.4 diff --git a/crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.txt b/crates/nvisy-pattern/assets/dictionaries/en/personal/nationalities.txt similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/world/personal/nationalities.txt rename to crates/nvisy-pattern/assets/dictionaries/en/personal/nationalities.txt diff --git a/crates/nvisy-pattern/assets/dictionaries/world/personal/religions.toml b/crates/nvisy-pattern/assets/dictionaries/en/personal/religions.toml similarity index 55% rename from crates/nvisy-pattern/assets/dictionaries/world/personal/religions.toml rename to crates/nvisy-pattern/assets/dictionaries/en/personal/religions.toml index eaf55b82..8205f97e 100644 --- a/crates/nvisy-pattern/assets/dictionaries/world/personal/religions.toml +++ b/crates/nvisy-pattern/assets/dictionaries/en/personal/religions.toml @@ -1,3 +1,4 @@ name = "religions" label = "religion" -score = 0.85 +languages = ["en"] +score = 0.4 diff --git a/crates/nvisy-pattern/assets/dictionaries/world/personal/religions.txt 
b/crates/nvisy-pattern/assets/dictionaries/en/personal/religions.txt similarity index 100% rename from crates/nvisy-pattern/assets/dictionaries/world/personal/religions.txt rename to crates/nvisy-pattern/assets/dictionaries/en/personal/religions.txt diff --git a/crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.toml b/crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.toml index 2d73da4f..2bbbc05d 100644 --- a/crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.toml +++ b/crates/nvisy-pattern/assets/dictionaries/world/finance/cryptocurrencies.toml @@ -1,3 +1,3 @@ name = "cryptocurrencies" label = "currency" -score = 0.85 +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/au/finance/abn.toml b/crates/nvisy-pattern/assets/patterns/au/finance/abn.toml new file mode 100644 index 00000000..a316fbc3 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/au/finance/abn.toml @@ -0,0 +1,25 @@ +# Australian Business Number (ABN): 11-digit ID issued by the +# Australian Business Register. The leading digit is rewritten +# (-1 with 0→9) before a weighted sum that must be divisible by +# 89. The conventional rendering is `NN NNN NNN NNN`. + +name = "au-abn" +label = "company_id" +countries = ["AU"] +languages = ["en"] +context = [ + "australian business number", + "abn", + "abr", + "australian business register", +] + +[[variants]] +regex = '\b\d{2}\s\d{3}\s\d{3}\s\d{3}\b' +score = 0.4 +validator = "au.abn" + +[[variants]] +regex = '\b\d{11}\b' +score = 0.2 +validator = "au.abn" diff --git a/crates/nvisy-pattern/assets/patterns/au/finance/acn.toml b/crates/nvisy-pattern/assets/patterns/au/finance/acn.toml new file mode 100644 index 00000000..714b6fcf --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/au/finance/acn.toml @@ -0,0 +1,24 @@ +# Australian Company Number (ACN): 9-digit ID issued by ASIC. +# Weighted-sum mod-10 checksum, complement-to-10. Conventional +# rendering is `NNN NNN NNN`. 
+ +name = "au-acn" +label = "company_id" +countries = ["AU"] +languages = ["en"] +context = [ + "australian company number", + "acn", + "asic", + "australian securities and investments commission", +] + +[[variants]] +regex = '\b\d{3}\s\d{3}\s\d{3}\b' +score = 0.4 +validator = "au.acn" + +[[variants]] +regex = '\b\d{9}\b' +score = 0.1 +validator = "au.acn" diff --git a/crates/nvisy-pattern/assets/patterns/au/health/medicare.toml b/crates/nvisy-pattern/assets/patterns/au/health/medicare.toml new file mode 100644 index 00000000..768e1c95 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/au/health/medicare.toml @@ -0,0 +1,25 @@ +# Australian Medicare card number: 10-digit ID where the first +# digit is in 2-6 and the 9th digit is a mod-10 weighted check. +# Rendering is `NNNN NNNNN N` (card number + issue number) +# or `NNNN NNNNN N N` (with individual reference number). + +name = "au-medicare" +label = "insurance_id" +countries = ["AU"] +languages = ["en"] +context = [ + "medicare", + "medicare card", + "medicare number", + "services australia", +] + +[[variants]] +regex = '\b[2-6]\d{3}\s\d{5}\s\d\b' +score = 0.4 +validator = "au.medicare" + +[[variants]] +regex = '\b[2-6]\d{9}\b' +score = 0.2 +validator = "au.medicare" diff --git a/crates/nvisy-pattern/assets/patterns/au/identity/tfn.toml b/crates/nvisy-pattern/assets/patterns/au/identity/tfn.toml new file mode 100644 index 00000000..f048d327 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/au/identity/tfn.toml @@ -0,0 +1,24 @@ +# Australian Tax File Number (TFN): 9-digit ID issued by the +# Australian Taxation Office. Weighted mod-11 checksum. +# Rendering is `NNN NNN NNN`. 
+ +name = "au-tfn" +label = "tax_id" +countries = ["AU"] +languages = ["en"] +context = [ + "tax file number", + "tfn", + "australian taxation office", + "ato", +] + +[[variants]] +regex = '\b\d{3}\s\d{3}\s\d{3}\b' +score = 0.4 +validator = "au.tfn" + +[[variants]] +regex = '\b\d{9}\b' +score = 0.1 +validator = "au.tfn" diff --git a/crates/nvisy-pattern/assets/patterns/ca/contact/postal_code.toml b/crates/nvisy-pattern/assets/patterns/ca/contact/postal_code.toml new file mode 100644 index 00000000..bc388a61 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/ca/contact/postal_code.toml @@ -0,0 +1,20 @@ +# Canadian postal code: `A1A 1A1` format where the letter +# alphabet excludes D, F, I, O, Q, U (and W, Z don't appear as +# the first letter — Canada Post Address Standard). + +name = "ca-postal-code" +label = "postal_code" +countries = ["CA"] +languages = ["en", "fr"] +context = [ + "postal code", + "code postal", + "canada post", + "postes canada", + "mailing address", + "adresse postale", +] + +[[variants]] +regex = '\b[ABCEGHJ-NPRSTVXY]\d[A-CEGHJ-NPRSTV-Z][ -]?\d[A-CEGHJ-NPRSTV-Z]\d\b' +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/ca/identity/sin.toml b/crates/nvisy-pattern/assets/patterns/ca/identity/sin.toml new file mode 100644 index 00000000..d8707d97 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/ca/identity/sin.toml @@ -0,0 +1,32 @@ +# Canadian Social Insurance Number (SIN): 9 digits issued by +# Employment and Social Development Canada. First digit must be +# in `[1-7, 9]` (0 and 8 reserved). Luhn checksum. 
+ +name = "ca-sin" +label = "government_id" +countries = ["CA"] +languages = ["en", "fr"] +context = [ + "sin", + "social insurance number", + "social insurance", + "nas", + "numéro d'assurance sociale", + "numéro nas", + "assurance sociale", +] + +[[variants]] +regex = '\b[1-79]\d{2} \d{3} \d{3}\b' +score = 0.5 +validator = "ca.sin" + +[[variants]] +regex = '\b[1-79]\d{2}-\d{3}-\d{3}\b' +score = 0.5 +validator = "ca.sin" + +[[variants]] +regex = '\b[1-79]\d{8}\b' +score = 0.2 +validator = "ca.sin" diff --git a/crates/nvisy-pattern/assets/patterns/de/contact/plz.toml b/crates/nvisy-pattern/assets/patterns/de/contact/plz.toml new file mode 100644 index 00000000..fc6b6d02 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/contact/plz.toml @@ -0,0 +1,21 @@ +# German Postleitzahl (PLZ): 5 digits, with `01000` and `99999` +# reserved as test sentinels and never assigned. + +name = "de-plz" +label = "postal_code" +countries = ["DE"] +languages = ["de"] +context = [ + "postleitzahl", + "plz", + "postanschrift", + "anschrift", + "lieferadresse", + "rechnungsadresse", + "postfach", +] + +[[variants]] +regex = '\b(0[1-9]\d{3}|[1-9]\d{4})\b' +score = 0.3 +validator = "de.plz" diff --git a/crates/nvisy-pattern/assets/patterns/de/finance/handelsregister.toml b/crates/nvisy-pattern/assets/patterns/de/finance/handelsregister.toml new file mode 100644 index 00000000..46d9b7bd --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/finance/handelsregister.toml @@ -0,0 +1,23 @@ +# German Handelsregisternummer: court-registered company ID with +# section prefix `HRA` (partnerships) or `HRB` (capital companies) +# followed by 1-6 digits. 
+ +name = "de-handelsregister" +label = "company_id" +countries = ["DE"] +languages = ["de"] +context = [ + "handelsregister", + "handelsregisternummer", + "registernummer", + "registergericht", + "amtsgericht", + "hrb", + "hra", + "handelsregister-nr", + "hr-nummer", +] + +[[variants]] +regex = '\bHR[AB]\s*\d{1,6}\b' +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/de/health/bsnr.toml b/crates/nvisy-pattern/assets/patterns/de/health/bsnr.toml new file mode 100644 index 00000000..fb98d757 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/health/bsnr.toml @@ -0,0 +1,29 @@ +# German Betriebsstättennummer (BSNR): 9-digit practice/site-of-care +# number assigned by the regional Kassenärztliche Vereinigung (KV) +# under § 75 Abs. 7 SGB V. Appears on Rezepte and billing records. + +name = "de-bsnr" +label = "medical_id" +countries = ["DE"] +languages = ["de"] +context = [ + "betriebsstättennummer", + "betriebsstätten-nummer", + "bsnr", + "betriebsstätte", + "praxisnummer", + "arztpraxis", + "praxis", + "kassenärztliche vereinigung", + "kv-nummer", + "kv nummer", + "praxisadresse", + "praxisstandort", + "behandlungsort", + "vertragsarztpraxis", +] + +[[variants]] +regex = '\b\d{9}\b' +score = 0.2 +validator = "de.bsnr" diff --git a/crates/nvisy-pattern/assets/patterns/de/health/health_insurance.toml b/crates/nvisy-pattern/assets/patterns/de/health/health_insurance.toml new file mode 100644 index 00000000..8c094cc4 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/health/health_insurance.toml @@ -0,0 +1,33 @@ +# German Krankenversicherungsnummer (KVNR): 10 characters — +# leading letter + 9 digits. Printed on the eGK (elektronische +# Gesundheitskarte). Check digit per § 290 SGB V Anlage 1. 
+ +name = "de-health-insurance" +label = "insurance_id" +countries = ["DE"] +languages = ["de"] +context = [ + "krankenversicherungsnummer", + "krankenversichertennummer", + "versichertennummer", + "kvnr", + "krankenkasse", + "krankenversicherung", + "gesundheitskarte", + "egk", + "elektronische gesundheitskarte", + "gkv", + "gesetzliche krankenversicherung", + "krankenversicherungsausweis", + "versichertenausweis", + "versichertenkarte", + "aok", + "tkk", + "barmer", + "dak", +] + +[[variants]] +regex = '\b[A-Z]\d{9}\b' +score = 0.3 +validator = "de.health_insurance" diff --git a/crates/nvisy-pattern/assets/patterns/de/health/lanr.toml b/crates/nvisy-pattern/assets/patterns/de/health/lanr.toml new file mode 100644 index 00000000..b41d523b --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/health/lanr.toml @@ -0,0 +1,31 @@ +# German Lebenslange Arztnummer (LANR): 9-digit lifetime physician +# number assigned by the Kassenärztliche Vereinigung. Pos 7 is a +# check digit per the KBV Arztnummern-Richtlinie; pos 8-9 carry +# the Fachgruppe (specialty) code. + +name = "de-lanr" +label = "medical_id" +countries = ["DE"] +languages = ["de"] +context = [ + "arztnummer", + "lanr", + "lebenslange arztnummer", + "arzt-nr", + "arzt nr", + "arzt-nummer", + "vertragsarzt", + "kassenarzt", + "niedergelassener arzt", + "kbv", + "rezept", + "verschreibung", + "behandelnder arzt", + "hausarzt", + "facharzt", +] + +[[variants]] +regex = '\b\d{9}\b' +score = 0.3 +validator = "de.lanr" diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/fuehrerschein.toml b/crates/nvisy-pattern/assets/patterns/de/identity/fuehrerschein.toml new file mode 100644 index 00000000..c18693bd --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/fuehrerschein.toml @@ -0,0 +1,34 @@ +# German Führerscheinnummer (driving licence number): 11 +# alphanumeric characters in the EU-harmonized post-2013 format +# (FeV Anlage 8). 
Format is opaque — position 3 onward mixes +# letters and digits per issuing authority. The trailing check +# character's algorithm is not published by KBA, so no validator +# is wired. + +name = "de-fuehrerschein" +label = "drivers_license" +countries = ["DE"] +languages = ["de"] +context = [ + "führerscheinnummer", + "führerschein", + "fahrerlaubnis", + "fahrerlaubnisnummer", + "fahrerlaubnisklasse", + "führerscheininhaber", + "fev", + "kba", + "kraftfahrt-bundesamt", + "driving licence", + "driving license", + "driver's license", + "licence number", + "license number", + "dokument nr", + "dokument-nr", + "feld 5", +] + +[[variants]] +regex = '\b[A-Z]\d{2}[A-Z0-9]{6}\d[A-Z0-9]\b' +score = 0.35 diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/id_card.toml b/crates/nvisy-pattern/assets/patterns/de/identity/id_card.toml new file mode 100644 index 00000000..85fb9a20 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/id_card.toml @@ -0,0 +1,36 @@ +# German Personalausweis (national ID card). Two formats coexist: +# +# - nPA (since November 2010): 9 alphanumeric characters from the +# ICAO restricted charset (excludes A, B, D, E, I, O, Q, S, U) +# plus a trailing ICAO check digit. +# - Legacy (pre-2010): letter `T` + 8 digits. No ICAO checksum. 
+ +name = "de-id-card" +label = "government_id" +countries = ["DE"] +languages = ["de"] +context = [ + "personalausweis", + "ausweis", + "personalausweisnummer", + "ausweisnummer", + "ausweisdokument", + "dokumentennummer", + "seriennummer", + "npa", + "neuer personalausweis", + "personalausweisgesetz", + "bundespersonalausweis", + "identity card", + "national id", +] + +[[variants]] +regex = '\b[CFGHJKLMNPRTVWXYZ][CFGHJKLMNPRTVWXYZ0-9]{7}[0-9]\b' +score = 0.4 +validator = "de.id_card" + +[[variants]] +regex = '\bT\d{8}\b' +score = 0.5 +validator = "de.id_card" diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/de/identity/passport.toml new file mode 100644 index 00000000..499b95a4 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/passport.toml @@ -0,0 +1,23 @@ +# German Reisepass: 9-character serial from the ICAO restricted +# charset (excludes A, B, D, E, I, O, Q, S, U) plus a trailing +# ICAO Doc 9303 check digit. Issued by the Bundesdruckerei. + +name = "de-passport" +label = "passport_number" +countries = ["DE"] +languages = ["de"] +context = [ + "passnummer", + "reisepassnummer", + "passport number", + "pass-nr", + "dokumentennummer", + "reisepass", + "reisedokument", + "bundesdruckerei", +] + +[[variants]] +regex = '\b[CFGHJKLMNPRTVWXYZ][CFGHJKLMNPRTVWXYZ0-9]{7}[0-9]\b' +score = 0.4 +validator = "de.passport" diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/social_security.toml b/crates/nvisy-pattern/assets/patterns/de/identity/social_security.toml new file mode 100644 index 00000000..2b1e5510 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/social_security.toml @@ -0,0 +1,25 @@ +# German Rentenversicherungsnummer (RVNR / SVNR): 12 characters +# encoding region + birth date + surname initial + serial + check +# digit per VKVV § 4 (Deutsche Rentenversicherung). 
+ +name = "de-social-security" +label = "national_insurance_number" +countries = ["DE"] +languages = ["de"] +context = [ + "rentenversicherungsnummer", + "sozialversicherungsnummer", + "versicherungsnummer", + "rvnr", + "svnr", + "sv-nummer", + "rentenversicherung", + "deutsche rentenversicherung", + "drv", + "sozialversicherung", +] + +[[variants]] +regex = '\b\d{8}[A-Z]\d{3}\b' +score = 0.5 +validator = "de.social_security" diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/tax_id.toml b/crates/nvisy-pattern/assets/patterns/de/identity/tax_id.toml new file mode 100644 index 00000000..004c2922 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/tax_id.toml @@ -0,0 +1,24 @@ +# German Steueridentifikationsnummer (Steuer-IdNr): 11-digit +# lifetime tax identifier per Bundeszentralamt für Steuern. +# Check digit at pos 11 via ISO 7064 Mod 11, 10. No leading zero. + +name = "de-tax-id" +label = "tax_id" +countries = ["DE"] +languages = ["de"] +context = [ + "steueridentifikationsnummer", + "steuerliche identifikationsnummer", + "persönliche identifikationsnummer", + "idnr", + "steuer-idnr", + "steuer idnr", + "steueridentnummer", + "bzst", + "bundeszentralamt für steuern", +] + +[[variants]] +regex = '\b[1-9]\d{10}\b' +score = 0.4 +validator = "de.tax_id" diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/tax_number.toml b/crates/nvisy-pattern/assets/patterns/de/identity/tax_number.toml new file mode 100644 index 00000000..66e42d53 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/tax_number.toml @@ -0,0 +1,34 @@ +# German Steuernummer (regional tax number): 10 or 11 digits +# allocated per Finanzamt. Format varies by Bundesland and the +# Vereinheitlichtes Bundesschema, so three regex variants cover the +# common renderings. No checksum is publicly documented. 
+ +name = "de-tax-number" +label = "tax_id" +countries = ["DE"] +languages = ["de"] +context = [ + "steuernummer", + "st-nr", + "stnr", + "st.-nr", + "steuer-nr", + "finanzamt", + "bundeseinheitliche steuernummer", + "vereinheitlichtes bundesschema", +] + +# Vereinheitlichtes Bundesschema: 13 digits. +[[variants]] +regex = '\b\d{13}\b' +score = 0.2 + +# Common rendering: 3/4/4 grouped form. +[[variants]] +regex = '\b\d{3}/\d{4}/\d{4}\b' +score = 0.4 + +# Common rendering: 2-3/3/4-5 grouped form (e.g. 2/3/5; some Länder use this). +[[variants]] +regex = '\b\d{2,3}/\d{3}/\d{4,5}\b' +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/de/identity/vat_id.toml b/crates/nvisy-pattern/assets/patterns/de/identity/vat_id.toml new file mode 100644 index 00000000..5c1549b4 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/identity/vat_id.toml @@ -0,0 +1,32 @@ +# German Umsatzsteuer-Identifikationsnummer (USt-IdNr): `DE` prefix +# + 9 digits. Used in B2B invoices and VAT registration. Validator +# applies the community-documented ISO 7064 Mod 11, 10 heuristic. +# Whitespace and dashes are stripped before validation. 
+ +name = "de-vat-id" +label = "tax_id" +countries = ["DE"] +languages = ["de"] +context = [ + "umsatzsteuer-identifikationsnummer", + "umsatzsteueridentifikationsnummer", + "ust-idnr", + "ustidnr", + "ust idnr", + "ust-id", + "umsatzsteuer", + "mehrwertsteuer", + "mwst-id", + "vat number", + "vat id", +] + +[[variants]] +regex = '\bDE\d{9}\b' +score = 0.5 +validator = "de.vat_id" + +[[variants]] +regex = '\bDE[\s.\-]?\d{3}[\s.\-]?\d{3}[\s.\-]?\d{3}\b' +score = 0.4 +validator = "de.vat_id" diff --git a/crates/nvisy-pattern/assets/patterns/de/vehicle/kfz.toml b/crates/nvisy-pattern/assets/patterns/de/vehicle/kfz.toml new file mode 100644 index 00000000..61a7fe83 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/de/vehicle/kfz.toml @@ -0,0 +1,44 @@ +# German Kfz-Kennzeichen (license plate): 1-3 letter +# Unterscheidungszeichen (city/district code), optional +# Erkennungszeichen (1-2 letters), and 1-4 digit serial. +# Multiple variants cover hyphen, space, and Umlaut renderings. + +name = "de-kfz" +label = "license_plate" +countries = ["DE"] +languages = ["de"] +context = [ + "kennzeichen", + "kfz-kennzeichen", + "kfz kennzeichen", + "amtliches kennzeichen", + "fahrzeugschein", + "fahrzeugbrief", + "zulassung", + "nummernschild", +] + +# Canonical hyphen-separated rendering: BB-AB 1234. +[[variants]] +regex = '\b[A-ZÄÖÜ]{1,3}[-\s][A-Z]{1,2}\s?\d{1,4}H?E?\b' +score = 0.3 + +# ASCII-only rendering (commonly typed without Umlaut). +[[variants]] +regex = '\b[A-Z]{1,3}[-\s][A-Z]{1,2}\s?\d{1,4}H?E?\b' +score = 0.2 + +# Glued form without spaces: BBAB1234. +[[variants]] +regex = '\b[A-ZÄÖÜ]{1,3}[A-Z]{1,2}\d{1,4}H?E?\b' +score = 0.2 + +# Sondercode for diplomats and federal authorities (leading zero). +[[variants]] +regex = '\b0\s?\d{1,4}\b' +score = 0.1 + +# Saisonkennzeichen with month range suffix. 
+[[variants]] +regex = '\b[A-ZÄÖÜ]{1,3}[-\s][A-Z]{1,2}\s?\d{1,4}\s\d{2}\b' +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/es/contact/codigo_postal.toml b/crates/nvisy-pattern/assets/patterns/es/contact/codigo_postal.toml new file mode 100644 index 00000000..6f29396c --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/es/contact/codigo_postal.toml @@ -0,0 +1,19 @@ +# Spanish código postal: 5 digits where the first two are the +# province code (01-52, including Ceuta=51 and Melilla=52). Codes +# 53-99 are unassigned. + +name = "es-codigo-postal" +label = "postal_code" +countries = ["ES"] +languages = ["es"] +context = [ + "código postal", + "codigo postal", + "cp", + "c.p.", + "dirección postal", +] + +[[variants]] +regex = '\b(0[1-9]|[1-4]\d|5[0-2])\d{3}\b' +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/es/finance/cif.toml b/crates/nvisy-pattern/assets/patterns/es/finance/cif.toml new file mode 100644 index 00000000..6344a137 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/es/finance/cif.toml @@ -0,0 +1,21 @@ +# Spanish CIF (Código de Identificación Fiscal): entity-class +# letter + 7 digits + control char (digit or letter depending on +# entity class). Now formally subsumed by NIF for new entities, +# but legacy IDs remain in wide circulation. 
+ +name = "es-cif" +label = "company_id" +countries = ["ES"] +languages = ["es"] +context = [ + "código de identificación fiscal", + "cif", + "identificación fiscal", + "sociedad", + "empresa", +] + +[[variants]] +regex = '\b[ABCDEFGHJNPQRSUVW]-?\d{7}-?[0-9A-J]\b' +score = 0.5 +validator = "es.cif" diff --git a/crates/nvisy-pattern/assets/patterns/es/identity/nie.toml b/crates/nvisy-pattern/assets/patterns/es/identity/nie.toml new file mode 100644 index 00000000..d4b8df28 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/es/identity/nie.toml @@ -0,0 +1,21 @@ +# Spanish NIE (Número de Identidad de Extranjero): X, Y, or Z +# prefix + 7 digits + Mod 23 control letter (X→0, Y→1, Z→2 before +# the modulo). + +name = "es-nie" +label = "government_id" +countries = ["ES"] +languages = ["es"] +context = [ + "número de identidad de extranjero", + "número de identificación de extranjero", + "nie", + "tarjeta de residencia", + "residencia", + "extranjero", +] + +[[variants]] +regex = '\b[XYZ]-?\d{7}-?[A-Z]\b' +score = 0.5 +validator = "es.nie" diff --git a/crates/nvisy-pattern/assets/patterns/es/identity/nif.toml b/crates/nvisy-pattern/assets/patterns/es/identity/nif.toml new file mode 100644 index 00000000..629dd738 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/es/identity/nif.toml @@ -0,0 +1,22 @@ +# Spanish NIF / DNI: 8 digits + Mod 23 control letter. The legacy +# DNI (citizens) and NIF (taxpayers) share the algorithm; the +# leading `0` is sometimes omitted, so 7 digits + letter is also +# accepted by the validator. 
+ +name = "es-nif" +label = "government_id" +countries = ["ES"] +languages = ["es"] +context = [ + "documento nacional de identidad", + "número de identificación fiscal", + "dni", + "nif", + "identificación", + "identidad", +] + +[[variants]] +regex = '\b\d{7,8}-?[A-Z]\b' +score = 0.5 +validator = "es.nif" diff --git a/crates/nvisy-pattern/assets/patterns/es/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/es/identity/passport.toml new file mode 100644 index 00000000..c02c4686 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/es/identity/passport.toml @@ -0,0 +1,18 @@ +# Spanish passport: 3 letters + 6 digits (e.g. AAA123456). No +# published checksum, so the score stays low and relies on the +# context-keyword boost. + +name = "es-passport" +label = "passport_number" +countries = ["ES"] +languages = ["es"] +context = [ + "pasaporte", + "número de pasaporte", + "passport", + "passport number", +] + +[[variants]] +regex = '\b[A-Z]{3}\d{6}\b' +score = 0.1 diff --git a/crates/nvisy-pattern/assets/patterns/fi/identity/hetu.toml b/crates/nvisy-pattern/assets/patterns/fi/identity/hetu.toml new file mode 100644 index 00000000..885c1c53 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/fi/identity/hetu.toml @@ -0,0 +1,24 @@ +# Finnish Henkilötunnus (HETU): 11 chars encoding birth date, +# century separator, serial, and control character. Issued by +# Digi- ja väestötietovirasto (DVV). The control character is +# validated by `fi.hetu` (mod-31 lookup into +# `0123456789ABCDEFHJKLMNPRSTUVWXY`). 
+ +name = "fi-hetu" +label = "government_id" +countries = ["FI"] +languages = ["fi", "sv"] +context = [ + "henkilötunnus", + "hetu", + "personbeteckning", + "personbeteckningen", + "personal identity code", + "social security number", + "dvv", +] + +[[variants]] +regex = '\b\d{6}[+\-A-FYXWVU]\d{3}[0-9A-FHJKLMNPRSTUVWXY]\b' +score = 0.5 +validator = "fi.hetu" diff --git a/crates/nvisy-pattern/assets/patterns/in/finance/gstin.toml b/crates/nvisy-pattern/assets/patterns/in/finance/gstin.toml new file mode 100644 index 00000000..c0dfc3f5 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/in/finance/gstin.toml @@ -0,0 +1,20 @@ +# Indian Goods and Services Tax Identification Number (GSTIN): +# 15 chars: state code (01-37) + 10-char PAN + registration +# sequence + `Z` literal + base-36 weighted check digit. + +name = "in-gstin" +label = "tax_id" +countries = ["IN"] +languages = ["en", "hi"] +context = [ + "gstin", + "gst", + "goods and services tax", + "gst number", + "gst registration", +] + +[[variants]] +regex = '\b(0[1-9]|[12]\d|3[0-7])[A-Z]{5}\d{4}[A-Z][A-Z0-9]Z[A-Z0-9]\b' +score = 0.6 +validator = "in.gstin" diff --git a/crates/nvisy-pattern/assets/patterns/in/identity/aadhaar.toml b/crates/nvisy-pattern/assets/patterns/in/identity/aadhaar.toml new file mode 100644 index 00000000..ff80b08f --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/in/identity/aadhaar.toml @@ -0,0 +1,29 @@ +# Indian Aadhaar number: 12-digit ID issued by UIDAI. Leading +# digit ≥ 2 (UIDAI reserves 0xx and 1xx); the number must not be +# a palindrome; Verhoeff checksum. 
+ +name = "in-aadhaar" +label = "government_id" +countries = ["IN"] +languages = ["en", "hi"] +context = [ + "aadhaar", + "uidai", + "unique identification authority", + "आधार", +] + +[[variants]] +regex = '\b[2-9]\d{3}\s\d{4}\s\d{4}\b' +score = 0.4 +validator = "in.aadhaar" + +[[variants]] +regex = '\b[2-9]\d{3}-\d{4}-\d{4}\b' +score = 0.4 +validator = "in.aadhaar" + +[[variants]] +regex = '\b[2-9]\d{11}\b' +score = 0.2 +validator = "in.aadhaar" diff --git a/crates/nvisy-pattern/assets/patterns/in/identity/pan.toml b/crates/nvisy-pattern/assets/patterns/in/identity/pan.toml new file mode 100644 index 00000000..21bffa69 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/in/identity/pan.toml @@ -0,0 +1,19 @@ +# Indian Permanent Account Number (PAN): 10-char alphanumeric +# `AAAAA9999A` issued by the Income Tax Department. The 4th +# character encodes the entity type per IT Department spec. + +name = "in-pan" +label = "tax_id" +countries = ["IN"] +languages = ["en", "hi"] +context = [ + "pan", + "permanent account number", + "income tax", + "pan card", +] + +[[variants]] +regex = '\b[A-Z]{3}[ABCFGHJLPTabcfghjlpt][A-Z]\d{4}[A-Z]\b' +score = 0.5 +validator = "in.pan" diff --git a/crates/nvisy-pattern/assets/patterns/in/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/in/identity/passport.toml new file mode 100644 index 00000000..cbc51b8b --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/in/identity/passport.toml @@ -0,0 +1,19 @@ +# Indian passport: 8-char alphanumeric in the form +# `LDDDDDDD` where the first char is a letter, position 2 is a +# non-zero digit, and the last is a non-zero check digit. No +# published checksum. 
+ +name = "in-passport" +label = "passport_number" +countries = ["IN"] +languages = ["en", "hi"] +context = [ + "passport", + "indian passport", + "passport number", + "ministry of external affairs", +] + +[[variants]] +regex = '\b[A-Z][1-9]\d\s?\d{4}[1-9]\b' +score = 0.2 diff --git a/crates/nvisy-pattern/assets/patterns/in/identity/voter.toml b/crates/nvisy-pattern/assets/patterns/in/identity/voter.toml new file mode 100644 index 00000000..94630fae --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/in/identity/voter.toml @@ -0,0 +1,25 @@ +# Indian Voter ID (EPIC — Elector's Photo Identity Card): +# 10-char alphanumeric issued by the Election Commission of +# India. Two formats: 3 letters + 7 digits (older), or 1 letter +# + restricted-set 2nd char + letter + 7 digits. + +name = "in-voter" +label = "government_id" +countries = ["IN"] +languages = ["en", "hi"] +context = [ + "voter", + "voter id", + "epic", + "elector photo identity card", + "election commission", + "मतदाता", +] + +[[variants]] +regex = '\b[A-Z][ABCDGHJKMNPRSY][A-Z]\d{7}\b' +score = 0.4 + +[[variants]] +regex = '\b[A-Z]{3}\d{7}\b' +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/in/vehicle/registration.toml b/crates/nvisy-pattern/assets/patterns/in/vehicle/registration.toml new file mode 100644 index 00000000..28f0b077 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/in/vehicle/registration.toml @@ -0,0 +1,31 @@ +# Indian vehicle registration: state code (2 letters) + RTO +# district (2 digits) + series (1-3 letters) + serial (4 digits). +# Per-state district codes are not validated here — relying on +# the canonical structural pattern + RTO/registration context. + +name = "in-vehicle-registration" +label = "license_plate" +countries = ["IN"] +languages = ["en", "hi"] +context = [ + "rto", + "vehicle", + "registration plate", + "vehicle number", + "regional transport office", +] + +# Canonical: SS-DD-LL-NNNN. 
+[[variants]] +regex = '\b[A-Z]{2}\d{2}[A-Z]{1,2}\d{4}\b' +score = 0.5 + +# Older single-digit district code: SS-D-LLL-NNNN. +[[variants]] +regex = '\b[A-Z]{2}\d[A-Z]{1,3}\d{4}\b' +score = 0.4 + +# Diplomatic, consular, and UN prefixes (CD / CC / UN). +[[variants]] +regex = '\b\d{1,3}(CD|CC|UN)[1-9]\d{1,3}\b' +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/it/finance/vat_code.toml b/crates/nvisy-pattern/assets/patterns/it/finance/vat_code.toml new file mode 100644 index 00000000..526e7fe0 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/it/finance/vat_code.toml @@ -0,0 +1,20 @@ +# Italian Partita IVA (P.IVA): 11 digits with a Luhn-like +# checksum. Validator rejects the all-zero sentinel. + +name = "it-vat-code" +label = "tax_id" +countries = ["IT"] +languages = ["it"] +context = [ + "partita iva", + "piva", + "p.iva", + "p. iva", + "numero iva", + "agenzia delle entrate", +] + +[[variants]] +regex = '\b\d{11}\b' +score = 0.3 +validator = "it.vat_code" diff --git a/crates/nvisy-pattern/assets/patterns/it/identity/driving_licence.toml b/crates/nvisy-pattern/assets/patterns/it/identity/driving_licence.toml new file mode 100644 index 00000000..06e806bd --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/it/identity/driving_licence.toml @@ -0,0 +1,22 @@ +# Italian patente di guida: 2 letters + 7 digits + 1 letter +# (classic format) or `U1` + 7 alnum + 1 letter (Motorizzazione +# Civile format issued post-2013). 
+ +name = "it-driving-licence" +label = "drivers_license" +countries = ["IT"] +languages = ["it"] +context = [ + "patente di guida", + "patente", + "motorizzazione civile", + "numero patente", +] + +[[variants]] +regex = '(?i)\b[A-Z]{2}\d{7}[A-Z]\b' +score = 0.2 + +[[variants]] +regex = '(?i)\bU1[A-HJ-NP-Z0-9]{7}[A-Z]\b' +score = 0.3 diff --git a/crates/nvisy-pattern/assets/patterns/it/identity/fiscal_code.toml b/crates/nvisy-pattern/assets/patterns/it/identity/fiscal_code.toml new file mode 100644 index 00000000..426d0881 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/it/identity/fiscal_code.toml @@ -0,0 +1,23 @@ +# Italian Codice Fiscale (CF): 16 alphanumeric chars encoding +# surname/name initials, birth date, municipality, and control +# letter. The regex covers omocodia (digit→letter rewrites used +# when two people would collide) and structural sanity (month +# letter, day range, sex offset). Final validation by checksum +# in `it.fiscal_code`. + +name = "it-fiscal-code" +label = "tax_id" +countries = ["IT"] +languages = ["it"] +context = [ + "codice fiscale", + "cf", + "c.f.", + "tessera sanitaria", + "agenzia delle entrate", +] + +[[variants]] +regex = '''(?i)\b(?:[A-Z][AEIOU][AEIOUX]|[AEIOU]X{2}|[B-DF-HJ-NP-TV-Z]{2}[A-Z]){2}(?:[\dLMNP-V]{2}(?:[A-EHLMPR-T](?:[04LQ][1-9MNP-V]|[15MR][\dLMNP-V]|[26NS][0-8LMNP-U])|[DHPS][37PT][0L]|[ACELMRT][37PT][01LM]|[AC-EHLMPR-T][26NS][9V])|(?:[02468LNQSU][048LQU]|[13579MPRTV][26NS])B[26NS][9V])(?:[A-MZ][1-9MNP-V][\dLMNP-V]{2}|[A-M][0L](?:[1-9MNP-V][\dLMNP-V]|[0L][1-9MNP-V]))[A-Z]\b''' +score = 0.4 +validator = "it.fiscal_code" diff --git a/crates/nvisy-pattern/assets/patterns/it/identity/identity_card.toml b/crates/nvisy-pattern/assets/patterns/it/identity/identity_card.toml new file mode 100644 index 00000000..ec22523c --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/it/identity/identity_card.toml @@ -0,0 +1,33 @@ +# Italian Carta d'Identità: paper-based (`AA0000000`), +# Carta d'Identità Elettronica 
(CIE) 2.0 (`0000000AA`), and CIE +# 3.0 (`AA00000AA`). No published checksum, so scores stay weak +# and rely on the context-keyword boost. + +name = "it-identity-card" +label = "government_id" +countries = ["IT"] +languages = ["it"] +context = [ + "carta d'identità", + "carta di identità", + "carta identità", + "cie", + "documento di identità", + "documento di riconoscimento", + "espatrio", +] + +# Paper-based (issued through 2018, still valid until expiry). +[[variants]] +regex = '(?i)\b[A-Z]{2}\s?\d{7}\b' +score = 0.05 + +# CIE 2.0. +[[variants]] +regex = '(?i)\b\d{7}[A-Z]{2}\b' +score = 0.05 + +# CIE 3.0 (current standard). +[[variants]] +regex = '(?i)\b[A-Z]{2}\d{5}[A-Z]{2}\b' +score = 0.05 diff --git a/crates/nvisy-pattern/assets/patterns/it/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/it/identity/passport.toml new file mode 100644 index 00000000..75356c3f --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/it/identity/passport.toml @@ -0,0 +1,19 @@ +# Italian passport: 2 letters + 7 digits (Polizia di Stato +# format). No published checksum, so the score stays weak. + +name = "it-passport" +label = "passport_number" +countries = ["IT"] +languages = ["it"] +context = [ + "passaporto", + "passaporto italiano", + "passaporto elettronico", + "documento di viaggio", + "polizia di stato", + "dogana", +] + +[[variants]] +regex = '(?i)\b[A-Z]{2}\d{7}\b' +score = 0.05 diff --git a/crates/nvisy-pattern/assets/patterns/kr/finance/brn.toml b/crates/nvisy-pattern/assets/patterns/kr/finance/brn.toml new file mode 100644 index 00000000..2e5e4906 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/kr/finance/brn.toml @@ -0,0 +1,31 @@ +# Korean Business Registration Number (BRN): 10 digits formatted +# as `AAA-BB-CCCCC` (district office + business type + serial + +# check digit). Magic-keys Mod-10 checksum per National Tax +# Service spec. 
+ +name = "kr-brn" +label = "company_id" +countries = ["KR"] +languages = ["ko"] +context = [ + "brn", + "사업자등록번호", + "사업자번호", + "사업자", + "business registration number", + "korean brn", + "business number", + "tax registration number", +] + +# Hyphen-separated rendering. +[[variants]] +regex = '\b\d{3}-\d{2}-\d{5}\b' +score = 0.5 +validator = "kr.brn" + +# Glued rendering. +[[variants]] +regex = '\b\d{10}\b' +score = 0.2 +validator = "kr.brn" diff --git a/crates/nvisy-pattern/assets/patterns/kr/identity/driver_license.toml b/crates/nvisy-pattern/assets/patterns/kr/identity/driver_license.toml new file mode 100644 index 00000000..87e3c333 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/kr/identity/driver_license.toml @@ -0,0 +1,28 @@ +# Korean driver's license number: 12 digits formatted as +# `AA-BB-CCCCCC-DD` (regional office + year + serial + check). +# Region code is validated against the published Doro-gyotongan +# list by `kr.driver_license`; the check digit algorithm itself +# is not publicly disclosed. + +name = "kr-driver-license" +label = "drivers_license" +countries = ["KR"] +languages = ["ko"] +context = [ + "운전면허", + "운전면허번호", + "면허번호", + "korean driver license", + "korean driver's license", + "driver license number", +] + +[[variants]] +regex = '\b\d{2}-\d{2}-\d{6}-\d{2}\b' +score = 0.4 +validator = "kr.driver_license" + +[[variants]] +regex = '\b\d{12}\b' +score = 0.2 +validator = "kr.driver_license" diff --git a/crates/nvisy-pattern/assets/patterns/kr/identity/frn.toml b/crates/nvisy-pattern/assets/patterns/kr/identity/frn.toml new file mode 100644 index 00000000..347cb7bc --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/kr/identity/frn.toml @@ -0,0 +1,27 @@ +# Korean Foreigner Registration Number (FRN): same shape as RRN +# but the gender/century digit is in `[5-8]` and the checksum +# formula adds 13 instead of 11. 
+ +name = "kr-frn" +label = "government_id" +countries = ["KR"] +languages = ["ko"] +context = [ + "frn", + "외국인등록번호", + "외국인번호", + "foreigner registration number", + "korean frn", +] + +# Hyphen-separated rendering. +[[variants]] +regex = '\b\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])-[5-8]\d{6}\b' +score = 0.5 +validator = "kr.frn" + +# Glued rendering. +[[variants]] +regex = '\b\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])[5-8]\d{6}\b' +score = 0.4 +validator = "kr.frn" diff --git a/crates/nvisy-pattern/assets/patterns/kr/identity/passport.toml b/crates/nvisy-pattern/assets/patterns/kr/identity/passport.toml new file mode 100644 index 00000000..06b27aaa --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/kr/identity/passport.toml @@ -0,0 +1,27 @@ +# Korean passport number. Current format: a leading letter from +# `M/S/R/O/D` followed by 3 digits, one letter, and 4 digits. +# Previous format: leading letter + 8 digits. No published +# checksum. + +name = "kr-passport" +label = "passport_number" +countries = ["KR"] +languages = ["ko"] +context = [ + "passport", + "passport number", + "korean passport", + "여권", + "대한민국 여권", + "여권번호", +] + +# Current format `LDDDLDDDD`. +[[variants]] +regex = '\b[MSRODmsrod]\d{3}[A-Za-z]\d{4}\b' +score = 0.2 + +# Legacy 9-character format `LDDDDDDDD`. +[[variants]] +regex = '\b[MSRODmsrod]\d{8}\b' +score = 0.1 diff --git a/crates/nvisy-pattern/assets/patterns/kr/identity/rrn.toml b/crates/nvisy-pattern/assets/patterns/kr/identity/rrn.toml new file mode 100644 index 00000000..2efc9288 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/kr/identity/rrn.toml @@ -0,0 +1,29 @@ +# Korean Resident Registration Number (RRN): 13 digits encoding +# birth date, gender/century, region, serial, and weighted Mod-11 +# check digit. Format `YYMMDD-GHIJKLX`. Pre-October 2020 numbers +# carry the checksum; later issues use a random tail. 
+ +name = "kr-rrn" +label = "government_id" +countries = ["KR"] +languages = ["ko"] +context = [ + "rrn", + "주민등록번호", + "주민번호", + "korean rrn", + "korean resident registration number", + "resident registration number", +] + +# Hyphen-separated rendering. +[[variants]] +regex = '\b\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])-[1-4]\d{6}\b' +score = 0.5 +validator = "kr.rrn" + +# Glued rendering. +[[variants]] +regex = '\b\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])[1-4]\d{6}\b' +score = 0.4 +validator = "kr.rrn" diff --git a/crates/nvisy-pattern/assets/patterns/ng/identity/nin.toml b/crates/nvisy-pattern/assets/patterns/ng/identity/nin.toml new file mode 100644 index 00000000..01cbf301 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/ng/identity/nin.toml @@ -0,0 +1,22 @@ +# Nigerian National Identification Number (NIN): 11-digit ID +# issued by the National Identity Management Commission (NIMC). +# The trailing digit is a Verhoeff checksum. + +name = "ng-nin" +label = "government_id" +countries = ["NG"] +languages = ["en"] +context = [ + "nin", + "national identification number", + "national identity number", + "nimc", + "national identity", + "nigeria id", + "nigerian identification", +] + +[[variants]] +regex = '\b\d{11}\b' +score = 0.3 +validator = "ng.nin" diff --git a/crates/nvisy-pattern/assets/patterns/ng/vehicle/registration.toml b/crates/nvisy-pattern/assets/patterns/ng/vehicle/registration.toml new file mode 100644 index 00000000..b105b6e1 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/ng/vehicle/registration.toml @@ -0,0 +1,20 @@ +# Nigerian vehicle registration plate (2011-present format): +# 3-letter LGA (Local Government Area) code + 3 digits + 2-letter +# year/batch code. Optional hyphen or space separator. 
+ +name = "ng-vehicle-registration" +label = "license_plate" +countries = ["NG"] +languages = ["en"] +context = [ + "plate number", + "vehicle registration", + "license plate", + "number plate", + "frsc", + "federal road safety corps", +] + +[[variants]] +regex = '\b[A-Z]{3}[- ]?\d{3}[A-Z]{2}\b' +score = 0.5 diff --git a/crates/nvisy-pattern/assets/patterns/pl/contact/kod_pocztowy.toml b/crates/nvisy-pattern/assets/patterns/pl/contact/kod_pocztowy.toml new file mode 100644 index 00000000..e9a0e647 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/pl/contact/kod_pocztowy.toml @@ -0,0 +1,18 @@ +# Polish kod pocztowy: NN-NNN format introduced by Poczta Polska +# in 1973. The first two digits encode the postal region (00-99, +# all values are assigned). + +name = "pl-kod-pocztowy" +label = "postal_code" +countries = ["PL"] +languages = ["pl"] +context = [ + "kod pocztowy", + "kod", + "poczta polska", + "adres pocztowy", +] + +[[variants]] +regex = '\b\d{2}-\d{3}\b' +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/pl/finance/nip.toml b/crates/nvisy-pattern/assets/patterns/pl/finance/nip.toml new file mode 100644 index 00000000..8bef2a45 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/pl/finance/nip.toml @@ -0,0 +1,31 @@ +# Polish NIP (Numer Identyfikacji Podatkowej): 10-digit tax ID +# with mod-11 weighted checksum per Ustawa o zasadach ewidencji i +# identyfikacji podatników. The regex accepts the canonical +# `XXX-XXX-XX-XX` and `XXX-XX-XX-XXX` separator renderings. 
+ +name = "pl-nip" +label = "tax_id" +countries = ["PL"] +languages = ["pl"] +context = [ + "nip", + "numer identyfikacji podatkowej", + "identyfikacja podatkowa", + "urząd skarbowy", + "ministerstwo finansów", +] + +[[variants]] +regex = '\b\d{10}\b' +score = 0.3 +validator = "pl.nip" + +[[variants]] +regex = '\b\d{3}-\d{3}-\d{2}-\d{2}\b' +score = 0.5 +validator = "pl.nip" + +[[variants]] +regex = '\b\d{3}-\d{2}-\d{2}-\d{3}\b' +score = 0.4 +validator = "pl.nip" diff --git a/crates/nvisy-pattern/assets/patterns/pl/finance/regon.toml b/crates/nvisy-pattern/assets/patterns/pl/finance/regon.toml new file mode 100644 index 00000000..b4cfa4ca --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/pl/finance/regon.toml @@ -0,0 +1,26 @@ +# Polish REGON (Rejestr Gospodarki Narodowej): 9-digit +# entity-level or 14-digit unit-level company registry number. +# Both forms use a weighted mod-11 checksum, validated by +# `pl.regon`. + +name = "pl-regon" +label = "company_id" +countries = ["PL"] +languages = ["pl"] +context = [ + "regon", + "numer regon", + "rejestr gospodarki narodowej", + "gus", + "główny urząd statystyczny", +] + +[[variants]] +regex = '\b\d{9}\b' +score = 0.3 +validator = "pl.regon" + +[[variants]] +regex = '\b\d{14}\b' +score = 0.4 +validator = "pl.regon" diff --git a/crates/nvisy-pattern/assets/patterns/pl/identity/pesel.toml b/crates/nvisy-pattern/assets/patterns/pl/identity/pesel.toml new file mode 100644 index 00000000..8daf03c5 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/pl/identity/pesel.toml @@ -0,0 +1,21 @@ +# Polish PESEL (Powszechny Elektroniczny System Ewidencji +# Ludności): 11 digits encoding birth date + sex + serial + +# weighted-mod-10 check digit. The regex enforces structural +# sanity (day 01-31, month range using +20/+40/+60/+80 +# Ergänzung). 
+ +name = "pl-pesel" +label = "government_id" +countries = ["PL"] +languages = ["pl"] +context = [ + "pesel", + "numer pesel", + "numer ewidencyjny", + "ewidencja ludności", +] + +[[variants]] +regex = '\b\d{2}([02468][1-9]|[13579][012])(0[1-9]|1[0-9]|2[0-9]|3[01])\d{5}\b' +score = 0.4 +validator = "pl.pesel" diff --git a/crates/nvisy-pattern/assets/patterns/se/contact/postnummer.toml b/crates/nvisy-pattern/assets/patterns/se/contact/postnummer.toml new file mode 100644 index 00000000..b85ed9c4 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/se/contact/postnummer.toml @@ -0,0 +1,23 @@ +# Swedish postnummer: 5-digit postal code in `XXX XX` rendering +# (Postnord standard). Codes start in `10-99x` range; `00xxx` is +# reserved. + +name = "se-postnummer" +label = "postal_code" +countries = ["SE"] +languages = ["sv"] +context = [ + "postnummer", + "post", + "postnord", + "postadress", + "postal code", +] + +[[variants]] +regex = '\b[1-9]\d{2}\s\d{2}\b' +score = 0.5 + +[[variants]] +regex = '\b[1-9]\d{4}\b' +score = 0.2 diff --git a/crates/nvisy-pattern/assets/patterns/se/finance/organisationsnummer.toml b/crates/nvisy-pattern/assets/patterns/se/finance/organisationsnummer.toml new file mode 100644 index 00000000..298c5d12 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/se/finance/organisationsnummer.toml @@ -0,0 +1,21 @@ +# Swedish Organisationsnummer: 10 digits issued by Bolagsverket +# (Swedish Companies Registration Office). Third digit must be +# ≥ 2 to distinguish from personnummer; Luhn checksum. 
+ +name = "se-organisationsnummer" +label = "company_id" +countries = ["SE"] +languages = ["sv"] +context = [ + "organisationsnummer", + "orgnr", + "org nr", + "org.nr", + "företagsnummer", + "bolagsverket", +] + +[[variants]] +regex = '\b\d{6}[-]?\d{4}\b' +score = 0.6 +validator = "se.organisationsnummer" diff --git a/crates/nvisy-pattern/assets/patterns/se/identity/personnummer.toml b/crates/nvisy-pattern/assets/patterns/se/identity/personnummer.toml new file mode 100644 index 00000000..033581ee --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/se/identity/personnummer.toml @@ -0,0 +1,29 @@ +# Swedish Personnummer: `YYMMDD-XXXX` (10-digit) or +# `YYYYMMDD-XXXX` (12-digit) personal identity number. +# Samordningsnummer adds 60 to the day field. Luhn checksum +# applies to the last 10 digits. + +name = "se-personnummer" +label = "government_id" +countries = ["SE"] +languages = ["sv"] +context = [ + "personnummer", + "svenskt personnummer", + "personlig identitet", + "samordningsnummer", + "ssn", + "personal identity number", +] + +# 12-digit form with optional dash/plus separator. +[[variants]] +regex = '\b\d{8}[+\-]?\d{4}\b' +score = 0.5 +validator = "se.personnummer" + +# 10-digit form with optional dash/plus separator. +[[variants]] +regex = '\b\d{6}[+\-]?\d{4}\b' +score = 0.5 +validator = "se.personnummer" diff --git a/crates/nvisy-pattern/assets/patterns/sg/contact/postal_code.toml b/crates/nvisy-pattern/assets/patterns/sg/contact/postal_code.toml new file mode 100644 index 00000000..f3fedd0e --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/sg/contact/postal_code.toml @@ -0,0 +1,18 @@ +# Singapore postal code: 6 digits where the first two encode the +# delivery sector (01-82 in current Singapore Post assignments). +# No published checksum. 
+ +name = "sg-postal-code" +label = "postal_code" +countries = ["SG"] +languages = ["en"] +context = [ + "postal code", + "singapore post", + "singpost", + "mailing address", +] + +[[variants]] +regex = '\b(0[1-9]|[1-7]\d|8[0-2])\d{4}\b' +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/sg/finance/uen.toml b/crates/nvisy-pattern/assets/patterns/sg/finance/uen.toml new file mode 100644 index 00000000..efb229ab --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/sg/finance/uen.toml @@ -0,0 +1,37 @@ +# Singapore Unique Entity Number (UEN) issued by ACRA. Three +# formats: +# +# - Format A: 9 chars (8 digits + check letter). +# - Format B: 10 chars (YYYY year + 5 digits + check letter). +# - Format C: 10 chars (T/S/R + 2 digits + 2-letter entity type +# + 4 digits + check letter). + +name = "sg-uen" +label = "company_id" +countries = ["SG"] +languages = ["en"] +context = [ + "uen", + "unique entity number", + "acra", + "business registration", + "company registration", +] + +# Format A. +[[variants]] +regex = '\b\d{8}[A-Z]\b' +score = 0.3 +validator = "sg.uen" + +# Format B (must start with a 4-digit year). +[[variants]] +regex = '\b\d{9}[A-Z]\b' +score = 0.3 +validator = "sg.uen" + +# Format C. +[[variants]] +regex = '\b[TSR]\d{2}[A-Z]{2}\d{4}[A-Z]\b' +score = 0.5 +validator = "sg.uen" diff --git a/crates/nvisy-pattern/assets/patterns/sg/identity/nric.toml b/crates/nvisy-pattern/assets/patterns/sg/identity/nric.toml new file mode 100644 index 00000000..6d681a1c --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/sg/identity/nric.toml @@ -0,0 +1,31 @@ +# Singapore NRIC / FIN: 9 characters — prefix letter (S/T for +# Singapore citizens or PRs, F/G/M for foreigners) + 7 digits + +# check letter. Validator applies the published weighted Mod-11 +# algorithm with prefix-specific letter tables. 
+ +name = "sg-nric" +label = "government_id" +countries = ["SG"] +languages = ["en"] +context = [ + "nric", + "nric#", + "fin", + "fin#", + "national registration", + "national registration identity", + "ica", +] + +# Tight form anchored to the published prefix set. +[[variants]] +regex = '\b[STFGM]\d{7}[A-Z]\b' +score = 0.5 +validator = "sg.nric" + +# Loose form (any leading letter); relies on the validator's +# prefix check. +[[variants]] +regex = '\b[A-Z]\d{7}[A-Z]\b' +score = 0.2 +validator = "sg.nric" diff --git a/crates/nvisy-pattern/assets/patterns/th/contact/postal_code.toml b/crates/nvisy-pattern/assets/patterns/th/contact/postal_code.toml new file mode 100644 index 00000000..5043f5d3 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/th/contact/postal_code.toml @@ -0,0 +1,19 @@ +# Thai postal code (รหัสไปรษณีย์): 5 digits where the first two +# encode the province (10-96 in current Thailand Post +# assignments). No published checksum. + +name = "th-postal-code" +label = "postal_code" +countries = ["TH"] +languages = ["th"] +context = [ + "รหัสไปรษณีย์", + "ไปรษณีย์", + "ที่อยู่", + "postal code", + "thailand post", +] + +[[variants]] +regex = '\b(1\d|[2-8]\d|9[0-6])\d{3}\b' +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/th/identity/national_id.toml b/crates/nvisy-pattern/assets/patterns/th/identity/national_id.toml new file mode 100644 index 00000000..64836d22 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/th/identity/national_id.toml @@ -0,0 +1,30 @@ +# Thai National ID (เลขประจำตัวประชาชน): 13-digit identifier +# issued by the Department of Provincial Administration. The +# first digit encodes the citizenship category (1-8) and the +# 13th is a weighted Mod-11 check digit. 
+ +name = "th-national-id" +label = "government_id" +countries = ["TH"] +languages = ["th"] +context = [ + "เลขประจำตัวประชาชน", + "เลขบัตรประชาชน", + "บัตรประจำตัวประชาชน", + "national id", + "thai national id", + "thai id", + "id card", +] + +# Grouped rendering: 1-2345-67890-12-1. +[[variants]] +regex = '\b[1-8]-\d{4}-\d{5}-\d{2}-\d\b' +score = 0.5 +validator = "th.national_id" + +# Glued rendering. +[[variants]] +regex = '\b[1-8]\d{12}\b' +score = 0.3 +validator = "th.national_id" diff --git a/crates/nvisy-pattern/assets/patterns/tr/contact/posta_kodu.toml b/crates/nvisy-pattern/assets/patterns/tr/contact/posta_kodu.toml new file mode 100644 index 00000000..5a972255 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/tr/contact/posta_kodu.toml @@ -0,0 +1,18 @@ +# Turkish posta kodu: 5-digit postal code. The first two digits +# encode the province (01-81), matching the license-plate +# province code. + +name = "tr-posta-kodu" +label = "postal_code" +countries = ["TR"] +languages = ["tr"] +context = [ + "posta kodu", + "ptt", + "adres", + "postal code", +] + +[[variants]] +regex = '\b(0[1-9]|[1-7]\d|8[01])\d{3}\b' +score = 0.4 diff --git a/crates/nvisy-pattern/assets/patterns/tr/identity/tckn.toml b/crates/nvisy-pattern/assets/patterns/tr/identity/tckn.toml new file mode 100644 index 00000000..24de8129 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/tr/identity/tckn.toml @@ -0,0 +1,24 @@ +# Turkish T.C. Kimlik No (TCKN): 11-digit national identifier +# issued by Nüfus ve Vatandaşlık İşleri (NVI) with a two-step +# weighted checksum. Required on KVKK compliance forms. 
+ +name = "tr-tckn" +label = "government_id" +countries = ["TR"] +languages = ["tr"] +context = [ + "tc kimlik", + "kimlik no", + "kimlik numarası", + "tckn", + "tc no", + "nüfus cüzdanı", + "national id", + "türk kimlik", + "turkish id", +] + +[[variants]] +regex = '\b[1-9]\d{10}\b' +score = 0.4 +validator = "tr.tckn" diff --git a/crates/nvisy-pattern/assets/patterns/tr/vehicle/license_plate.toml b/crates/nvisy-pattern/assets/patterns/tr/vehicle/license_plate.toml new file mode 100644 index 00000000..2d2af803 --- /dev/null +++ b/crates/nvisy-pattern/assets/patterns/tr/vehicle/license_plate.toml @@ -0,0 +1,28 @@ +# Turkish license plate (plaka): province code 01-81 + 1-3 +# letters (Turkish alphabet excludes Q, W, X) + 2-4 digits. +# Civilian format per Karayolları Trafik Kanunu Madde 23. + +name = "tr-license-plate" +label = "license_plate" +countries = ["TR"] +languages = ["tr"] +context = [ + "plaka", + "araç plakası", + "plaka numarası", + "kayıt plakası", + "taşıt plakası", + "tr plaka", + "license plate", + "number plate", +] + +# Space-separated rendering: `34 ABC 1234`. +[[variants]] +regex = '\b(0[1-9]|[1-7]\d|8[01])\s?[A-PR-VY-Z]{1,3}\s?\d{2,4}\b' +score = 0.4 + +# Hyphen-separated rendering: `34-ABC-1234`. +[[variants]] +regex = '\b(0[1-9]|[1-7]\d|8[01])-[A-PR-VY-Z]{1,3}-\d{2,4}\b' +score = 0.5 diff --git a/crates/nvisy-pattern/src/shipped/dictionaries/en.rs b/crates/nvisy-pattern/src/shipped/dictionaries/en.rs new file mode 100644 index 00000000..407311e6 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/dictionaries/en.rs @@ -0,0 +1,51 @@ +//! English-language dictionaries — terms written in English +//! and meant to fire on English documents. +//! +//! Demonyms, religion names, and language names translate +//! across locales (`American` → `américain`, `Catholic` → +//! `catolico`, `English` → `englisch`). The terms here are the +//! English forms; a non-English document needs its own locale +//! sub-module (`fr`, `es`, …) once those land. 
Runtime +//! filtering by `RecognizerInput.language` keeps these +//! dictionaries from firing when the caller asserts a +//! non-English document. + +use crate::{__shipped_dictionary as shipped_dictionary, Dictionary}; + +shipped_dictionary!( + /// Fiat currency names and ISO 4217 codes (USD, US Dollar, + /// EUR, Euro, …). Long-form names are English; ISO codes + /// happen to match in non-English text too but the dictionary + /// is scoped to `en` so the long-form bulk doesn't fire on + /// French / German documents. + fn currencies + from "../../../assets/dictionaries/en/finance/currencies.toml" + with csv "../../../assets/dictionaries/en/finance/currencies.csv" +); +shipped_dictionary!( + /// English names of human languages plus ISO 639-1 codes + /// (English, en, French, fr, …). + fn languages + from "../../../assets/dictionaries/en/personal/languages.toml" + with csv "../../../assets/dictionaries/en/personal/languages.csv" +); +shipped_dictionary!( + /// English demonyms and nationality terms (American, French, + /// Japanese, …). + fn nationalities + from "../../../assets/dictionaries/en/personal/nationalities.toml" + with text "../../../assets/dictionaries/en/personal/nationalities.txt" +); +shipped_dictionary!( + /// English names of religious affiliations (Christian, Muslim, + /// Buddhist, …). + fn religions + from "../../../assets/dictionaries/en/personal/religions.toml" + with text "../../../assets/dictionaries/en/personal/religions.txt" +); + +/// Every English-scoped built-in dictionary. +#[must_use] +pub fn all() -> Vec { + vec![currencies(), languages(), nationalities(), religions()] +} diff --git a/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs b/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs index 2190100e..b0c20b1b 100644 --- a/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs +++ b/crates/nvisy-pattern/src/shipped/dictionaries/mod.rs @@ -1,15 +1,23 @@ //! Built-in [`Dictionary`]s, embedded at compile time. //! -//! 
Accessors are grouped by region — `world::*` for universal -//! dictionaries; future country-specific dictionaries land in -//! `::*` sub-modules. Each pairs a TOML metadata sidecar -//! (`assets/dictionaries///*.toml`) with a term -//! source (`*.csv` for multi-column lists, `*.txt` for one-per-line), -//! merging them via [`Dictionary::metadata_from_toml`] + -//! [`crate::Term::from_csv`] / [`crate::Term::from_text`]. +//! Accessors are grouped by *scope*: +//! +//! - `world::*` — universal: brand names and tickers +//! (cryptocurrencies) that read the same in any language. +//! - `<lang>::*` — locale-specific: terms written in `<lang>` +//! that translate when the document language changes +//! (`en::nationalities`, future `fr::nationalites`, etc.). +//! +//! Each pairs a TOML metadata sidecar +//! (`assets/dictionaries/<scope>/<category>/*.toml`) with a term +//! source (`*.csv` for multi-column lists, `*.txt` for +//! one-per-line), merging them via +//! [`Dictionary::metadata_from_toml`] + [`crate::Term::from_csv`] / +//! [`crate::Term::from_text`]. //! //! [`Dictionary`]: crate::Dictionary +pub mod en; pub mod world; use crate::Dictionary; @@ -51,10 +59,12 @@ macro_rules! __shipped_dictionary { } /// Every built-in dictionary shipped by this crate, regardless of -/// region. +/// scope. 
#[must_use] pub fn all() -> Vec { - world::all() + let mut dicts = world::all(); + dicts.extend(en::all()); + dicts } #[cfg(test)] @@ -63,15 +73,8 @@ mod tests { #[test] fn every_shipped_dictionary_parses() { - let dicts = all(); - assert_eq!(dicts.len(), 5); - for dict in &dicts { + for dict in all() { assert!(!dict.terms.is_empty(), "{} has no terms", dict.name); } } - - #[test] - fn world_set_has_5_dictionaries() { - assert_eq!(world::all().len(), 5); - } } diff --git a/crates/nvisy-pattern/src/shipped/dictionaries/world.rs b/crates/nvisy-pattern/src/shipped/dictionaries/world.rs index 7504a4e0..08ab2a27 100644 --- a/crates/nvisy-pattern/src/shipped/dictionaries/world.rs +++ b/crates/nvisy-pattern/src/shipped/dictionaries/world.rs @@ -1,4 +1,9 @@ -//! Universal dictionaries — apply regardless of jurisdiction. +//! Universal dictionaries — apply regardless of jurisdiction or +//! language. +//! +//! These are brand-name lists whose terms transcend locale. +//! Cryptocurrency names (`Bitcoin`, `Ethereum`) and tickers +//! (`BTC`, `ETH`) are the same string in every language. use crate::{__shipped_dictionary as shipped_dictionary, Dictionary}; @@ -9,41 +14,9 @@ shipped_dictionary!( from "../../../assets/dictionaries/world/finance/cryptocurrencies.toml" with csv "../../../assets/dictionaries/world/finance/cryptocurrencies.csv" ); -shipped_dictionary!( - /// Fiat currency names and ISO 4217 codes (USD, US Dollar, - /// EUR, Euro, …). - fn currencies - from "../../../assets/dictionaries/world/finance/currencies.toml" - with csv "../../../assets/dictionaries/world/finance/currencies.csv" -); -shipped_dictionary!( - /// Human-language names and ISO 639 codes (English, en, - /// French, fr, …). - fn languages - from "../../../assets/dictionaries/world/personal/languages.toml" - with csv "../../../assets/dictionaries/world/personal/languages.csv" -); -shipped_dictionary!( - /// Demonyms and nationality terms (American, French, …). 
- fn nationalities - from "../../../assets/dictionaries/world/personal/nationalities.toml" - with text "../../../assets/dictionaries/world/personal/nationalities.txt" -); -shipped_dictionary!( - /// Religious affiliations (Christianity, Islam, …). - fn religions - from "../../../assets/dictionaries/world/personal/religions.toml" - with text "../../../assets/dictionaries/world/personal/religions.txt" -); /// Every world-scoped built-in dictionary. #[must_use] pub fn all() -> Vec { - vec![ - cryptocurrencies(), - currencies(), - languages(), - nationalities(), - religions(), - ] + vec![cryptocurrencies()] } diff --git a/crates/nvisy-pattern/src/shipped/patterns/au.rs b/crates/nvisy-pattern/src/shipped/patterns/au.rs new file mode 100644 index 00000000..44609694 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/au.rs @@ -0,0 +1,32 @@ +//! Australia — patterns scoped to AU jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// ABN — 11-digit Australian Business Number with mod-89 + /// weighted checksum. + fn abn from "../../../assets/patterns/au/finance/abn.toml" +); +shipped_pattern!( + /// ACN — 9-digit Australian Company Number with mod-10 + /// weighted checksum. + fn acn from "../../../assets/patterns/au/finance/acn.toml" +); +shipped_pattern!( + /// Medicare — 10-digit Australian Medicare card number + /// (prefix 2-6, mod-10 weighted check). + fn medicare from "../../../assets/patterns/au/health/medicare.toml" +); +shipped_pattern!( + /// TFN — 9-digit Australian Tax File Number with mod-11 + /// weighted checksum. + fn tfn from "../../../assets/patterns/au/identity/tfn.toml" +); + +/// Every AU-scoped built-in pattern. 
+#[must_use] +pub fn all() -> Vec { + vec![abn(), acn(), medicare(), tfn()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/ca.rs b/crates/nvisy-pattern/src/shipped/patterns/ca.rs new file mode 100644 index 00000000..41c28407 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/ca.rs @@ -0,0 +1,22 @@ +//! Canada — patterns scoped to CA jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// SIN — 9-digit Social Insurance Number with Luhn checksum + /// (first digit in `[1-7, 9]`). + fn sin from "../../../assets/patterns/ca/identity/sin.toml" +); +shipped_pattern!( + /// Canadian postal code — `A1A 1A1` (Canada Post Address + /// Standard letter alphabet). + fn postal_code from "../../../assets/patterns/ca/contact/postal_code.toml" +); + +/// Every CA-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![sin(), postal_code()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/de.rs b/crates/nvisy-pattern/src/shipped/patterns/de.rs new file mode 100644 index 00000000..b2532f4f --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/de.rs @@ -0,0 +1,87 @@ +//! Germany — patterns scoped to DE jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// Betriebsstättennummer (BSNR) — 9-digit clinic/practice ID. + fn bsnr from "../../../assets/patterns/de/health/bsnr.toml" +); +shipped_pattern!( + /// Lebenslange Arztnummer (LANR) — 9-digit physician ID with + /// KBV checksum. + fn lanr from "../../../assets/patterns/de/health/lanr.toml" +); +shipped_pattern!( + /// Krankenversichertennummer (KVNR) — statutory health + /// insurance ID per §290 SGB V. 
+ fn health_insurance from "../../../assets/patterns/de/health/health_insurance.toml" +); +shipped_pattern!( + /// Personalausweisnummer — nPA (post-2010 ICAO) and legacy + /// `T`+8-digit ID card formats. + fn id_card from "../../../assets/patterns/de/identity/id_card.toml" +); +shipped_pattern!( + /// Reisepassnummer — ICAO Doc 9303 passport serial. + fn passport from "../../../assets/patterns/de/identity/passport.toml" +); +shipped_pattern!( + /// Rentenversicherungsnummer (RVNR/SVNR) — Deutsche + /// Rentenversicherung pension/social-security ID per VKVV § 4. + fn social_security from "../../../assets/patterns/de/identity/social_security.toml" +); +shipped_pattern!( + /// Steueridentifikationsnummer (Steuer-IdNr) — 11-digit + /// lifetime tax identifier with ISO 7064 Mod 11, 10 checksum. + fn tax_id from "../../../assets/patterns/de/identity/tax_id.toml" +); +shipped_pattern!( + /// Steuernummer — regional Finanzamt tax number. + fn tax_number from "../../../assets/patterns/de/identity/tax_number.toml" +); +shipped_pattern!( + /// Umsatzsteuer-Identifikationsnummer (USt-IdNr) — VAT + /// identification number. + fn vat_id from "../../../assets/patterns/de/identity/vat_id.toml" +); +shipped_pattern!( + /// Führerscheinnummer — EU-harmonized post-2013 driving + /// licence number. + fn fuehrerschein from "../../../assets/patterns/de/identity/fuehrerschein.toml" +); +shipped_pattern!( + /// Kfz-Kennzeichen — license plate (Unterscheidungszeichen + + /// Erkennungszeichen). + fn kfz from "../../../assets/patterns/de/vehicle/kfz.toml" +); +shipped_pattern!( + /// Postleitzahl (PLZ) — 5-digit postal code. + fn plz from "../../../assets/patterns/de/contact/plz.toml" +); +shipped_pattern!( + /// Handelsregisternummer — court-registered company ID with + /// `HRA`/`HRB` section prefix. + fn handelsregister from "../../../assets/patterns/de/finance/handelsregister.toml" +); + +/// Every DE-scoped built-in pattern. 
+#[must_use] +pub fn all() -> Vec { + vec![ + bsnr(), + lanr(), + health_insurance(), + id_card(), + passport(), + social_security(), + tax_id(), + tax_number(), + vat_id(), + fuehrerschein(), + kfz(), + plz(), + handelsregister(), + ] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/es.rs b/crates/nvisy-pattern/src/shipped/patterns/es.rs new file mode 100644 index 00000000..5495b4f5 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/es.rs @@ -0,0 +1,34 @@ +//! Spain — patterns scoped to ES jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// NIF / DNI — 8-digit national ID + Mod 23 letter. + fn nif from "../../../assets/patterns/es/identity/nif.toml" +); +shipped_pattern!( + /// NIE — foreign-resident ID with `X`/`Y`/`Z` prefix. + fn nie from "../../../assets/patterns/es/identity/nie.toml" +); +shipped_pattern!( + /// Spanish passport — 3 letters + 6 digits. + fn passport from "../../../assets/patterns/es/identity/passport.toml" +); +shipped_pattern!( + /// CIF — company tax ID with entity-class letter + 7 digits + + /// control char (digit or letter per class). + fn cif from "../../../assets/patterns/es/finance/cif.toml" +); +shipped_pattern!( + /// Código postal — 5-digit postal code (province 01-52 in + /// the leading pair). + fn codigo_postal from "../../../assets/patterns/es/contact/codigo_postal.toml" +); + +/// Every ES-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![nif(), nie(), passport(), cif(), codigo_postal()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/fi.rs b/crates/nvisy-pattern/src/shipped/patterns/fi.rs new file mode 100644 index 00000000..0cc0a62f --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/fi.rs @@ -0,0 +1,17 @@ +//! Finland — patterns scoped to FI jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. 
+ +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// HETU — Finnish personal identity code with mod-31 control + /// character. + fn hetu from "../../../assets/patterns/fi/identity/hetu.toml" +); + +/// Every FI-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![hetu()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/in.rs b/crates/nvisy-pattern/src/shipped/patterns/in.rs new file mode 100644 index 00000000..f3d44dbb --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/in.rs @@ -0,0 +1,49 @@ +//! India — patterns scoped to IN jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// Aadhaar — 12-digit UIDAI ID with Verhoeff checksum and + /// no-palindrome rule. + fn aadhaar from "../../../assets/patterns/in/identity/aadhaar.toml" +); +shipped_pattern!( + /// PAN — 10-char Permanent Account Number with entity-type + /// letter at position 4. + fn pan from "../../../assets/patterns/in/identity/pan.toml" +); +shipped_pattern!( + /// GSTIN — 15-char Goods and Services Tax ID with base-36 + /// weighted check digit. + fn gstin from "../../../assets/patterns/in/finance/gstin.toml" +); +shipped_pattern!( + /// Indian passport — 8-char alphanumeric (letter + non-zero + /// digit + 5 digits + non-zero digit). + fn passport from "../../../assets/patterns/in/identity/passport.toml" +); +shipped_pattern!( + /// EPIC voter ID — 10-char alphanumeric issued by the + /// Election Commission of India. + fn voter from "../../../assets/patterns/in/identity/voter.toml" +); +shipped_pattern!( + /// Indian vehicle registration — state + RTO district + + /// series + 4-digit serial. + fn vehicle_registration from "../../../assets/patterns/in/vehicle/registration.toml" +); + +/// Every IN-scoped built-in pattern. 
+#[must_use] +pub fn all() -> Vec { + vec![ + aadhaar(), + pan(), + gstin(), + passport(), + voter(), + vehicle_registration(), + ] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/it.rs b/crates/nvisy-pattern/src/shipped/patterns/it.rs new file mode 100644 index 00000000..4ea10864 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/it.rs @@ -0,0 +1,43 @@ +//! Italy — patterns scoped to IT jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// Codice Fiscale — 16-character personal tax/health ID with + /// odd/even-mapping checksum, omocodia-aware. + fn fiscal_code from "../../../assets/patterns/it/identity/fiscal_code.toml" +); +shipped_pattern!( + /// Carta d'Identità — paper-based, CIE 2.0, and CIE 3.0 + /// renderings. + fn identity_card from "../../../assets/patterns/it/identity/identity_card.toml" +); +shipped_pattern!( + /// Italian passport — 2 letters + 7 digits (Polizia di Stato + /// format). + fn passport from "../../../assets/patterns/it/identity/passport.toml" +); +shipped_pattern!( + /// Patente di guida — classic + Motorizzazione Civile + /// `U1`-prefixed format. + fn driving_licence from "../../../assets/patterns/it/identity/driving_licence.toml" +); +shipped_pattern!( + /// Partita IVA (P.IVA) — 11-digit VAT identifier with + /// Luhn-like checksum. + fn vat_code from "../../../assets/patterns/it/finance/vat_code.toml" +); + +/// Every IT-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![ + fiscal_code(), + identity_card(), + passport(), + driving_licence(), + vat_code(), + ] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/kr.rs b/crates/nvisy-pattern/src/shipped/patterns/kr.rs new file mode 100644 index 00000000..2d0f9574 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/kr.rs @@ -0,0 +1,36 @@ +//! South Korea — patterns scoped to KR jurisdictional formats. +//! +//! 
See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// RRN — 13-digit Resident Registration Number with + /// weighted Mod-11 checksum (pre-Oct 2020) or random tail. + fn rrn from "../../../assets/patterns/kr/identity/rrn.toml" +); +shipped_pattern!( + /// FRN — Foreigner Registration Number (RRN shape with + /// gender/century digit 5-8 and (13 - sum) Mod-10 checksum). + fn frn from "../../../assets/patterns/kr/identity/frn.toml" +); +shipped_pattern!( + /// BRN — Business Registration Number with magic-keys + /// Mod-10 checksum. + fn brn from "../../../assets/patterns/kr/finance/brn.toml" +); +shipped_pattern!( + /// Korean passport — current (`LDDDLDDDD`) and legacy + /// (`LDDDDDDDD`) formats. + fn passport from "../../../assets/patterns/kr/identity/passport.toml" +); +shipped_pattern!( + /// Driver's license — 12-digit with region-code allowlist. + fn driver_license from "../../../assets/patterns/kr/identity/driver_license.toml" +); + +/// Every KR-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![rrn(), frn(), brn(), passport(), driver_license()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/mod.rs b/crates/nvisy-pattern/src/shipped/patterns/mod.rs index e0a82402..86319d14 100644 --- a/crates/nvisy-pattern/src/shipped/patterns/mod.rs +++ b/crates/nvisy-pattern/src/shipped/patterns/mod.rs @@ -11,6 +11,20 @@ //! [`Regex`]: crate::Regex //! 
[`PatternRecognizer::build`]: crate::PatternRecognizer +pub mod au; +pub mod ca; +pub mod de; +pub mod es; +pub mod fi; +pub mod r#in; +pub mod it; +pub mod kr; +pub mod ng; +pub mod pl; +pub mod se; +pub mod sg; +pub mod th; +pub mod tr; pub mod uk; pub mod us; pub mod world; @@ -43,6 +57,20 @@ pub fn all() -> Vec { let mut out = world::all(); out.extend(us::all()); out.extend(uk::all()); + out.extend(de::all()); + out.extend(es::all()); + out.extend(it::all()); + out.extend(pl::all()); + out.extend(au::all()); + out.extend(ca::all()); + out.extend(fi::all()); + out.extend(se::all()); + out.extend(r#in::all()); + out.extend(kr::all()); + out.extend(sg::all()); + out.extend(tr::all()); + out.extend(ng::all()); + out.extend(th::all()); out } @@ -51,24 +79,14 @@ mod tests { use super::*; #[test] - fn every_shipped_pattern_parses() { - let patterns = all(); - assert_eq!(patterns.len(), 34); - } - - #[test] - fn world_set_has_18_patterns() { - assert_eq!(world::all().len(), 18); - } - - #[test] - fn us_set_has_10_patterns() { - assert_eq!(us::all().len(), 10); - } - - #[test] - fn uk_set_has_6_patterns() { - assert_eq!(uk::all().len(), 6); + fn every_shipped_pattern_has_variants() { + for pattern in all() { + assert!( + !pattern.variants.is_empty(), + "pattern `{}` has no variants", + pattern.name, + ); + } } #[test] @@ -113,4 +131,228 @@ mod tests { ); } } + + #[test] + fn de_patterns_are_country_scoped_to_de() { + for pattern in de::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["DE"], + "DE-scoped pattern `{}` must declare countries = [DE]", + pattern.name, + ); + } + } + + #[test] + fn es_patterns_are_country_scoped_to_es() { + for pattern in es::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["ES"], + "ES-scoped pattern `{}` must declare countries = [ES]", + pattern.name, + ); + } + } + + #[test] + fn it_patterns_are_country_scoped_to_it() { + for pattern 
in it::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["IT"], + "IT-scoped pattern `{}` must declare countries = [IT]", + pattern.name, + ); + } + } + + #[test] + fn pl_patterns_are_country_scoped_to_pl() { + for pattern in pl::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["PL"], + "PL-scoped pattern `{}` must declare countries = [PL]", + pattern.name, + ); + } + } + + #[test] + fn au_patterns_are_country_scoped_to_au() { + for pattern in au::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["AU"], + "AU-scoped pattern `{}` must declare countries = [AU]", + pattern.name, + ); + } + } + + #[test] + fn ca_patterns_are_country_scoped_to_ca() { + for pattern in ca::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["CA"], + "CA-scoped pattern `{}` must declare countries = [CA]", + pattern.name, + ); + } + } + + #[test] + fn fi_patterns_are_country_scoped_to_fi() { + for pattern in fi::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["FI"], + "FI-scoped pattern `{}` must declare countries = [FI]", + pattern.name, + ); + } + } + + #[test] + fn se_patterns_are_country_scoped_to_se() { + for pattern in se::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["SE"], + "SE-scoped pattern `{}` must declare countries = [SE]", + pattern.name, + ); + } + } + + #[test] + fn in_patterns_are_country_scoped_to_in() { + for pattern in r#in::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["IN"], + "IN-scoped pattern `{}` must declare countries = [IN]", + pattern.name, + ); + } + } + + #[test] + fn kr_patterns_are_country_scoped_to_kr() { + for pattern in kr::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| 
c.as_str()) + .collect::>(), + vec!["KR"], + "KR-scoped pattern `{}` must declare countries = [KR]", + pattern.name, + ); + } + } + + #[test] + fn sg_patterns_are_country_scoped_to_sg() { + for pattern in sg::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["SG"], + "SG-scoped pattern `{}` must declare countries = [SG]", + pattern.name, + ); + } + } + + #[test] + fn tr_patterns_are_country_scoped_to_tr() { + for pattern in tr::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["TR"], + "TR-scoped pattern `{}` must declare countries = [TR]", + pattern.name, + ); + } + } + + #[test] + fn ng_patterns_are_country_scoped_to_ng() { + for pattern in ng::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["NG"], + "NG-scoped pattern `{}` must declare countries = [NG]", + pattern.name, + ); + } + } + + #[test] + fn th_patterns_are_country_scoped_to_th() { + for pattern in th::all() { + assert_eq!( + pattern + .countries + .iter() + .map(|c| c.as_str()) + .collect::>(), + vec!["TH"], + "TH-scoped pattern `{}` must declare countries = [TH]", + pattern.name, + ); + } + } } diff --git a/crates/nvisy-pattern/src/shipped/patterns/ng.rs b/crates/nvisy-pattern/src/shipped/patterns/ng.rs new file mode 100644 index 00000000..fd2fc8b5 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/ng.rs @@ -0,0 +1,22 @@ +//! Nigeria — patterns scoped to NG jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// NIN — 11-digit National Identification Number with + /// Verhoeff checksum. + fn nin from "../../../assets/patterns/ng/identity/nin.toml" +); +shipped_pattern!( + /// Vehicle registration — 3-letter LGA + 3 digits + 2-letter + /// year/batch (current 2011+ format). 
+ fn vehicle_registration from "../../../assets/patterns/ng/vehicle/registration.toml" +); + +/// Every NG-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![nin(), vehicle_registration()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/pl.rs b/crates/nvisy-pattern/src/shipped/patterns/pl.rs new file mode 100644 index 00000000..26e16989 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/pl.rs @@ -0,0 +1,32 @@ +//! Poland — patterns scoped to PL jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// PESEL — 11-digit personal ID with date + sex + serial + + /// weighted-mod-10 check digit. + fn pesel from "../../../assets/patterns/pl/identity/pesel.toml" +); +shipped_pattern!( + /// NIP — 10-digit taxpayer identification number, mod-11 + /// weighted checksum. + fn nip from "../../../assets/patterns/pl/finance/nip.toml" +); +shipped_pattern!( + /// REGON — 9 or 14-digit company registry number, mod-11 + /// weighted checksum. + fn regon from "../../../assets/patterns/pl/finance/regon.toml" +); +shipped_pattern!( + /// Kod pocztowy — `NN-NNN` postal code (Poczta Polska, + /// 1973-present). + fn kod_pocztowy from "../../../assets/patterns/pl/contact/kod_pocztowy.toml" +); + +/// Every PL-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![pesel(), nip(), regon(), kod_pocztowy()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/se.rs b/crates/nvisy-pattern/src/shipped/patterns/se.rs new file mode 100644 index 00000000..aa5f275a --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/se.rs @@ -0,0 +1,27 @@ +//! Sweden — patterns scoped to SE jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. 
+ +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// Personnummer — 10/12-digit personal identity number with + /// date validity (incl. samordningsnummer) and Luhn checksum. + fn personnummer from "../../../assets/patterns/se/identity/personnummer.toml" +); +shipped_pattern!( + /// Organisationsnummer — 10-digit Bolagsverket company ID + /// with third digit ≥ 2 and Luhn checksum. + fn organisationsnummer from "../../../assets/patterns/se/finance/organisationsnummer.toml" +); +shipped_pattern!( + /// Postnummer — 5-digit postal code in `XXX XX` rendering + /// (Postnord standard). + fn postnummer from "../../../assets/patterns/se/contact/postnummer.toml" +); + +/// Every SE-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![personnummer(), organisationsnummer(), postnummer()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/sg.rs b/crates/nvisy-pattern/src/shipped/patterns/sg.rs new file mode 100644 index 00000000..62349df7 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/sg.rs @@ -0,0 +1,27 @@ +//! Singapore — patterns scoped to SG jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// NRIC / FIN — Singapore National Registration Identity + /// Card / Foreign Identification Number with weighted Mod-11 + /// checksum. + fn nric from "../../../assets/patterns/sg/identity/nric.toml" +); +shipped_pattern!( + /// UEN — Unique Entity Number issued by ACRA (formats A, B, + /// and C, each with its own checksum). + fn uen from "../../../assets/patterns/sg/finance/uen.toml" +); +shipped_pattern!( + /// Singapore postal code — 6-digit Singapore Post code. + fn postal_code from "../../../assets/patterns/sg/contact/postal_code.toml" +); + +/// Every SG-scoped built-in pattern. 
+#[must_use] +pub fn all() -> Vec { + vec![nric(), uen(), postal_code()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/th.rs b/crates/nvisy-pattern/src/shipped/patterns/th.rs new file mode 100644 index 00000000..48fb43d4 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/th.rs @@ -0,0 +1,23 @@ +//! Thailand — patterns scoped to TH jurisdictional formats. +//! +//! Algorithm sourced from the Department of Provincial +//! Administration spec (not from the Presidio bundle). + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// National ID (เลขประจำตัวประชาชน) — 13-digit ID with + /// weighted Mod-11 check digit; first digit 1-8. + fn national_id from "../../../assets/patterns/th/identity/national_id.toml" +); +shipped_pattern!( + /// Thai postal code (รหัสไปรษณีย์) — 5-digit Thailand Post + /// code with province prefix 10-96. + fn postal_code from "../../../assets/patterns/th/contact/postal_code.toml" +); + +/// Every TH-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![national_id(), postal_code()] +} diff --git a/crates/nvisy-pattern/src/shipped/patterns/tr.rs b/crates/nvisy-pattern/src/shipped/patterns/tr.rs new file mode 100644 index 00000000..47615a86 --- /dev/null +++ b/crates/nvisy-pattern/src/shipped/patterns/tr.rs @@ -0,0 +1,27 @@ +//! Turkey — patterns scoped to TR jurisdictional formats. +//! +//! See `assets/PRESIDIO.md` for third-party attribution. + +use crate::{__shipped_pattern as shipped_pattern, Regex}; + +shipped_pattern!( + /// TCKN — 11-digit T.C. Kimlik No with two-step weighted + /// checksum. + fn tckn from "../../../assets/patterns/tr/identity/tckn.toml" +); +shipped_pattern!( + /// Turkish license plate (plaka) — space- and + /// hyphen-separated renderings, province codes 01-81. + fn license_plate from "../../../assets/patterns/tr/vehicle/license_plate.toml" +); +shipped_pattern!( + /// Turkish posta kodu — 5-digit postal code (province + /// prefix 01-81). 
+ fn posta_kodu from "../../../assets/patterns/tr/contact/posta_kodu.toml" +); + +/// Every TR-scoped built-in pattern. +#[must_use] +pub fn all() -> Vec { + vec![tckn(), license_plate(), posta_kodu()] +} diff --git a/crates/nvisy-pattern/src/validators/au/abn.rs b/crates/nvisy-pattern/src/validators/au/abn.rs new file mode 100644 index 00000000..bbbfc0f1 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/au/abn.rs @@ -0,0 +1,71 @@ +//! Australian Business Number (ABN) validator. +//! +//! 11 digits issued by the Australian Business Register (ABR). +//! Algorithm: subtract 1 from the leading digit (wrap 0→9), +//! then weighted sum with `[10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]` +//! must be divisible by 89. + +const WEIGHTS: [u32; 11] = [10, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19]; + +/// Return `true` when `value` is a valid 11-digit ABN. Whitespace +/// and dash separators in the canonical `NN NNN NNN NNN` rendering +/// are stripped before validation. +pub fn abn(value: &str) -> bool { + let digits: Vec = value + .chars() + .filter(|c| c.is_ascii_digit()) + .map(|c| c.to_digit(10).unwrap()) + .collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-') + .count(); + if digits.len() != 11 || extras > 0 { + return false; + } + let mut adjusted = digits; + adjusted[0] = if adjusted[0] == 0 { 9 } else { adjusted[0] - 1 }; + let sum: u32 = adjusted.iter().zip(WEIGHTS).map(|(d, w)| d * w).sum(); + sum.is_multiple_of(89) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_abn() { + // 51 824 753 556 — Australian Taxation Office sample. + assert!(abn("51824753556")); + } + + #[test] + fn accepts_with_separators() { + assert!(abn("51 824 753 556")); + assert!(abn("51-824-753-556")); + } + + #[test] + fn accepts_second_vector() { + // 53 004 085 616 — Telstra Corporation, public ABN. 
const WEIGHTS: [u32; 8] = [8, 7, 6, 5, 4, 3, 2, 1];

/// Return `true` when `value` is a valid 9-digit ACN.
///
/// ASCII whitespace and `-` separators (canonical `NNN NNN NNN`
/// rendering) are skipped; any other non-digit character fails the
/// check. The first eight digits are weighted with [`WEIGHTS`] and
/// the ninth digit must equal `(10 - sum mod 10) mod 10`.
pub fn acn(value: &str) -> bool {
    let mut digits: Vec<u32> = Vec::new();
    for ch in value.chars() {
        match ch.to_digit(10) {
            Some(d) => digits.push(d),
            // Permitted separators are dropped silently.
            None if ch == '-' || ch.is_ascii_whitespace() => {}
            None => return false,
        }
    }
    if digits.len() != 9 {
        return false;
    }

    let (body, check) = digits.split_at(8);
    let weighted: u32 = body.iter().zip(WEIGHTS).map(|(d, w)| d * w).sum();
    (10 - weighted % 10) % 10 == check[0]
}
const WEIGHTS: [u32; 8] = [1, 3, 7, 9, 1, 3, 7, 9];

/// Return `true` when the first 9 digits of `value` form a
/// Medicare number whose checksum matches.
///
/// ASCII whitespace and `-` separators are skipped; any other
/// non-digit character fails the check. Between 9 and 11 digits
/// are accepted, the first digit must be in `2..=6`, and the 9th
/// digit must equal the [`WEIGHTS`]-weighted sum of the first
/// eight digits mod 10. Digits after the 9th are not inspected.
pub fn medicare(value: &str) -> bool {
    let mut digits: Vec<u32> = Vec::new();
    for ch in value.chars() {
        if let Some(d) = ch.to_digit(10) {
            digits.push(d);
        } else if !(ch.is_ascii_whitespace() || ch == '-') {
            return false;
        }
    }

    // `&&` short-circuits, so `digits[0]` is only read when it exists.
    let plausible = matches!(digits.len(), 9..=11) && matches!(digits[0], 2..=6);
    if !plausible {
        return false;
    }

    let (body, rest) = digits.split_at(8);
    let weighted: u32 = body.iter().zip(WEIGHTS).map(|(d, w)| d * w).sum();
    weighted % 10 == rest[0]
}
const WEIGHTS: [u32; 9] = [1, 4, 3, 7, 5, 8, 6, 9, 10];

/// Return `true` when `value` is a valid 9-digit TFN.
///
/// ASCII whitespace and `-` separators (canonical `NNN NNN NNN`
/// rendering) are skipped; any other non-digit character fails the
/// check. The nine digits, weighted with [`WEIGHTS`], must sum to a
/// multiple of 11.
pub fn tfn(value: &str) -> bool {
    let mut seen = 0usize;
    let mut weighted = 0u32;
    for ch in value.chars() {
        match ch {
            '0'..='9' => {
                // Running out of weights means a tenth digit was supplied.
                let Some(&weight) = WEIGHTS.get(seen) else {
                    return false;
                };
                weighted += (ch as u32 - '0' as u32) * weight;
                seen += 1;
            }
            '-' => {}
            other if other.is_ascii_whitespace() => {}
            _ => return false,
        }
    }
    seen == WEIGHTS.len() && weighted.is_multiple_of(11)
}
/// Return `true` when `value` is a valid Canadian SIN.
///
/// ASCII whitespace and `-` separators (`NNN NNN NNN` /
/// `NNN-NNN-NNN`) are skipped; any other non-digit character fails
/// the check. Exactly nine digits are required, the first digit may
/// not be `0` or `8`, and the digits must pass the Luhn check.
pub fn sin(value: &str) -> bool {
    let mut digits: Vec<u32> = Vec::with_capacity(9);
    for ch in value.chars() {
        match ch.to_digit(10) {
            Some(d) => digits.push(d),
            None if ch == '-' || ch.is_ascii_whitespace() => {}
            None => return false,
        }
    }
    // Length is tested first, so the index is only read when present.
    if digits.len() != 9 || matches!(digits[0], 0 | 8) {
        return false;
    }

    // Luhn: walking from the right, every second digit is doubled and
    // reduced by 9 when the product exceeds 9.
    let luhn: u32 = digits
        .iter()
        .rev()
        .zip([1u32, 2].into_iter().cycle())
        .map(|(d, factor)| {
            let product = d * factor;
            if product > 9 { product - 9 } else { product }
        })
        .sum();
    luhn.is_multiple_of(10)
}
/// Return `true` when `value` is a structurally-plausible BSNR.
///
/// ASCII whitespace is ignored. What remains must be exactly nine
/// ASCII digits, at least one of which is not `0`.
pub fn bsnr(value: &str) -> bool {
    let mut length = 0usize;
    let mut has_nonzero = false;
    for ch in value.chars().filter(|c| !c.is_ascii_whitespace()) {
        if !ch.is_ascii_digit() {
            return false;
        }
        length += 1;
        has_nonzero |= ch != '0';
    }
    length == 9 && has_nonzero
}
/// Return `true` when `value` is a valid 10-character KVNR
/// (letter + 9 digits).
///
/// Surrounding whitespace is trimmed and the letter is upper-cased.
/// The letter is expanded to its two-digit 1-based ordinal
/// (`A` → `01` … `Z` → `26`) and followed by the eight data digits;
/// those ten digits are weighted `1, 2, 1, 2, …`, each product is
/// reduced to the sum of its decimal digits, and the total mod 10
/// must equal the trailing check digit.
pub fn health_insurance(value: &str) -> bool {
    let normalized = value.trim().to_ascii_uppercase();
    let mut symbols = normalized.chars();

    let Some(prefix) = symbols.next().filter(char::is_ascii_uppercase) else {
        return false;
    };
    // Everything after the letter must be a decimal digit, nine in total.
    let payload = match symbols.map(|c| c.to_digit(10)).collect::<Option<Vec<u32>>>() {
        Some(digits) if digits.len() == 9 => digits,
        _ => return false,
    };

    let ordinal = prefix as u32 - 'A' as u32 + 1;
    let total: u32 = [ordinal / 10, ordinal % 10]
        .into_iter()
        .chain(payload[..8].iter().copied())
        .zip([1u32, 2].into_iter().cycle())
        // For products below 10 the digit sum is the product itself.
        .map(|(digit, weight)| {
            let product = digit * weight;
            product / 10 + product % 10
        })
        .sum();

    total % 10 == payload[8]
}
+ assert!(health_insurance("A000500015")); + } + + #[test] + fn rejects_wrong_check_digit() { + assert!(!health_insurance("A000500016")); + } + + #[test] + fn rejects_missing_letter_prefix() { + assert!(!health_insurance("0000500015")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!health_insurance("A00050001")); + assert!(!health_insurance("A0005000150")); + assert!(!health_insurance("")); + } + + #[test] + fn rejects_non_digit_payload() { + assert!(!health_insurance("A00050001A")); + } +} diff --git a/crates/nvisy-pattern/src/validators/de/icao.rs b/crates/nvisy-pattern/src/validators/de/icao.rs new file mode 100644 index 00000000..9d2bf452 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/de/icao.rs @@ -0,0 +1,37 @@ +//! ICAO Doc 9303 check-digit helpers shared by German document +//! validators ([`passport`], [`id_card`]). +//! +//! Both nPA (Personalausweis since 2010) and German +//! Reisepass document numbers carry a 7-3-1 weighted check +//! digit at position 9, computed over the first 8 alphanumeric +//! characters with letters mapped `A`=10, `B`=11, …, `Z`=35. +//! +//! ICAO also restricts the serial-charset to letters excluding +//! `A`, `B`, `D`, `E`, `I`, `O`, `Q`, `S`, `U` (visually +//! ambiguous). Callers verify that themselves through the regex +//! character class; this helper only computes the checksum. +//! +//! [`passport`]: super::passport +//! [`id_card`]: super::id_card + +/// Compute the ICAO Doc 9303 check digit over an 8-character +/// alphanumeric serial. Returns `None` when a character is not +/// `0`–`9` or `A`–`Z`. 
+pub(super) fn mrz_check_digit(serial: &str) -> Option { + if serial.len() != 8 { + return None; + } + let weights = [7, 3, 1]; + let mut total: u32 = 0; + for (i, c) in serial.chars().enumerate() { + let value = if c.is_ascii_digit() { + c.to_digit(10).unwrap() + } else if c.is_ascii_uppercase() { + (c as u32) - ('A' as u32) + 10 + } else { + return None; + }; + total += value * weights[i % 3]; + } + Some(total % 10) +} diff --git a/crates/nvisy-pattern/src/validators/de/id_card.rs b/crates/nvisy-pattern/src/validators/de/id_card.rs new file mode 100644 index 00000000..63ca8ac5 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/de/id_card.rs @@ -0,0 +1,85 @@ +//! German Personalausweis (national ID card) validator. +//! +//! Two formats coexist on issued cards in 2026: +//! +//! - **nPA** (neuer Personalausweis), issued since November 2010: +//! 9 alphanumeric characters, ICAO Doc 9303 charset +//! (excludes A, B, D, E, I, O, Q, S, U) plus a trailing check +//! digit. The check digit uses 7-3-1 weights — same as +//! [`super::passport`]. +//! - **Legacy** `T`-prefix card, issued before 2010: letter `T` +//! followed by 8 digits. The trailing digit is part of the +//! serial, not a checksum. Accepted at pattern confidence +//! because there is no structural check to apply. + +use super::icao::mrz_check_digit; + +const FORBIDDEN_LETTERS: &str = "ABDEIOQSU"; + +/// Return `true` when `value` is a structurally-plausible +/// German Personalausweis number — either an nPA serial with a +/// valid ICAO Doc 9303 check digit, or a legacy `T` + 8-digit +/// number. +pub fn id_card(value: &str) -> bool { + let trimmed = value.trim().to_ascii_uppercase(); + if trimmed.len() != 9 { + return false; + } + + // Legacy T-format: `T` followed by 8 digits. No checksum. + if trimmed.starts_with('T') && trimmed[1..].chars().all(|c| c.is_ascii_digit()) { + return true; + } + + // nPA: ICAO 7-3-1 over first 8 chars, no forbidden letters. 
+ let (serial, check) = trimmed.split_at(8); + let Some(check_digit) = check.chars().next().and_then(|c| c.to_digit(10)) else { + return false; + }; + if serial.chars().any(|c| FORBIDDEN_LETTERS.contains(c)) { + return false; + } + mrz_check_digit(serial) == Some(check_digit) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_legacy_t_format() { + assert!(id_card("T22000124")); + // No checksum to verify, so any T+8-digit string passes. + assert!(id_card("T00000000")); + } + + #[test] + fn rejects_legacy_with_letter_payload() { + assert!(!id_card("T2200012A")); + } + + #[test] + fn accepts_npa_with_valid_check() { + // Serial `L01X00T44` — known nPA sample (legal text in + // PassG references). + assert!(id_card("L01X00T44")); + } + + #[test] + fn rejects_npa_with_forbidden_letter() { + // `B` is in the forbidden ICAO charset. + assert!(!id_card("LB1X00T44")); + } + + #[test] + fn rejects_npa_with_invalid_check() { + assert!(!id_card("L01X00T45")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!id_card("L01X00T4")); + assert!(!id_card("L01X00T440")); + assert!(!id_card("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/de/lanr.rs b/crates/nvisy-pattern/src/validators/de/lanr.rs new file mode 100644 index 00000000..9faee4c4 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/de/lanr.rs @@ -0,0 +1,67 @@ +//! German Lebenslange Arztnummer (LANR) checksum validator. +//! +//! 9-digit lifetime physician number. Check digit at position 7 +//! derives from positions 1–6 via the KBV Arztnummern-Richtlinie: +//! sum of `[4, 9, 4, 9, 4, 9]`-weighted digits, then the complement +//! to 10. Positions 8–9 carry the physician's Fachgruppe and are +//! not part of the checksum. + +/// Return `true` when `value` is a 9-digit LANR with a valid +/// position-7 check digit per KBV. 
pub fn lanr(value: &str) -> bool {
    // Gather the digits, ignoring ASCII whitespace; any other
    // non-digit character invalidates the whole value.
    let mut digits: Vec<u32> = Vec::with_capacity(9);
    for ch in value.chars().filter(|c| !c.is_ascii_whitespace()) {
        let Some(digit) = ch.to_digit(10) else {
            return false;
        };
        digits.push(digit);
    }
    if digits.len() != 9 {
        return false;
    }

    // Positions 1-6 are weighted 4, 9, 4, 9, 4, 9; position 7 must be
    // the complement of that sum to 10 (with 10 folding to 0).
    let weighted: u32 = digits[..6]
        .iter()
        .zip([4u32, 9].into_iter().cycle())
        .map(|(digit, weight)| digit * weight)
        .sum();
    (10 - weighted % 10) % 10 == digits[6]
}
[`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod bsnr; +mod health_insurance; +mod icao; +mod id_card; +mod lanr; +mod passport; +mod plz; +mod social_security; +mod tax_id; +mod vat_id; + +pub use self::bsnr::bsnr; +pub use self::health_insurance::health_insurance; +pub use self::id_card::id_card; +pub use self::lanr::lanr; +pub use self::passport::passport; +pub use self::plz::plz; +pub use self::social_security::social_security; +pub use self::tax_id::tax_id; +pub use self::vat_id::vat_id; diff --git a/crates/nvisy-pattern/src/validators/de/passport.rs b/crates/nvisy-pattern/src/validators/de/passport.rs new file mode 100644 index 00000000..1b4289fd --- /dev/null +++ b/crates/nvisy-pattern/src/validators/de/passport.rs @@ -0,0 +1,67 @@ +//! German Reisepass (passport) ICAO Doc 9303 check-digit +//! validator. +//! +//! 9-character serial; the trailing digit is the check digit +//! computed via 7-3-1 weighting over the first 8 characters. +//! Letters `A`, `B`, `D`, `E`, `I`, `O`, `Q`, `S`, `U` are +//! visually ambiguous and never appear in ICAO travel-document +//! serials — reject outright so a lucky checksum can't promote a +//! non-passport string. + +use super::icao::mrz_check_digit; + +const FORBIDDEN_LETTERS: &str = "ABDEIOQSU"; + +/// Return `true` when `value` is a 9-character German passport +/// serial whose final digit matches the ICAO Doc 9303 check. +pub fn passport(value: &str) -> bool { + let trimmed = value.trim().to_ascii_uppercase(); + if trimmed.len() != 9 { + return false; + } + let (serial, check) = trimmed.split_at(8); + let Some(check_digit) = check.chars().next().and_then(|c| c.to_digit(10)) else { + return false; + }; + if serial.chars().any(|c| FORBIDDEN_LETTERS.contains(c)) { + return false; + } + mrz_check_digit(serial) == Some(check_digit) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn rejects_forbidden_letters() { + // `B` is in the forbidden set. 
const SENTINELS: &[&str] = &["01000", "99999"];

/// Return `true` when `value` is a 5-digit PLZ outside the
/// reserved sentinel values.
///
/// Surrounding whitespace is trimmed. The remainder must be exactly
/// five ASCII digits, must not begin with `00`, and must not be one
/// of the codes listed in [`SENTINELS`].
pub fn plz(value: &str) -> bool {
    let code = value.trim();
    let bytes = code.as_bytes();
    if bytes.len() != 5 || !bytes.iter().all(u8::is_ascii_digit) {
        return false;
    }
    // Codes in the `00xxx` block are rejected wholesale.
    if bytes.starts_with(b"00") {
        return false;
    }
    !SENTINELS.iter().any(|reserved| *reserved == code)
}
pub fn social_security(value: &str) -> bool {
    // Weights applied to the 12 effective digits (8 leading digits,
    // 2-digit letter ordinal, 2 serial digits).
    const WEIGHTS: [u32; 12] = [2, 1, 2, 5, 7, 1, 2, 1, 2, 1, 2, 1];

    let symbols: Vec<char> = value.trim().to_ascii_uppercase().chars().collect();
    if symbols.len() != 12 {
        return false;
    }
    let initial = symbols[8];
    if !initial.is_ascii_uppercase() {
        return false;
    }

    // Every position except the letter must be a decimal digit. The
    // resulting vector holds 11 values: indices 0..8 are the leading
    // digits, 8..10 the serial, and 10 the check digit.
    let numbers = symbols
        .iter()
        .enumerate()
        .filter(|(index, _)| *index != 8)
        .map(|(_, ch)| ch.to_digit(10))
        .collect::<Option<Vec<u32>>>();
    let Some(numbers) = numbers else {
        return false;
    };

    // Structural date sanity: day 01-31 or 51-81; month 01-12.
    let day = numbers[2] * 10 + numbers[3];
    let month = numbers[4] * 10 + numbers[5];
    if !matches!(day, 1..=31 | 51..=81) || !matches!(month, 1..=12) {
        return false;
    }

    // Letter → 2-digit 1-based ordinal, spliced in after the first 8 digits.
    let ordinal = initial as u32 - 'A' as u32 + 1;
    let total: u32 = numbers[..8]
        .iter()
        .copied()
        .chain([ordinal / 10, ordinal % 10])
        .chain(numbers[8..10].iter().copied())
        .zip(WEIGHTS)
        .map(|(digit, weight)| {
            let product = digit * weight;
            product / 10 + product % 10
        })
        .sum();

    total % 10 == numbers[10]
}
/// Return `true` when `value` is a valid 11-digit German tax ID.
///
/// ASCII whitespace is ignored; any other non-digit character
/// fails the check. The first digit must not be `0`, no digit may
/// occur more than three times within the first ten positions, and
/// the eleventh digit must match the ISO 7064 Mod 11, 10 check
/// computed over the first ten.
pub fn tax_id(value: &str) -> bool {
    let parsed = value
        .chars()
        .filter(|c| !c.is_ascii_whitespace())
        .map(|c| c.to_digit(10))
        .collect::<Option<Vec<u32>>>();
    let Some(digits) = parsed else {
        return false;
    };
    let [payload @ .., check] = digits.as_slice() else {
        return false;
    };
    if payload.len() != 10 || payload[0] == 0 {
        return false;
    }

    // No digit appears more than 3 times in positions 1-10.
    let overused = (0..10u32).any(|digit| payload.iter().filter(|&&d| d == digit).count() > 3);
    if overused {
        return false;
    }

    // ISO 7064 Mod 11, 10.
    let product = payload.iter().fold(10u32, |product, digit| {
        let total = match (digit + product) % 10 {
            0 => 10,
            other => other,
        };
        (total * 2) % 11
    });
    // `product` is in 1..=10 here, so `% 10` only folds a check of 10 to 0.
    (11 - product) % 10 == *check
}
+ assert!(tax_id("36574261809")); + } + + #[test] + fn rejects_leading_zero() { + assert!(!tax_id("06592997048")); + } + + #[test] + fn rejects_wrong_check_digit() { + assert!(!tax_id("65929970480")); + } + + #[test] + fn rejects_digit_repeated_four_times() { + // Four `1`s in positions 1-10. + assert!(!tax_id("11112345601")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!tax_id("123456789")); + assert!(!tax_id("123456789012")); + assert!(!tax_id("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/de/vat_id.rs b/crates/nvisy-pattern/src/validators/de/vat_id.rs new file mode 100644 index 00000000..e2b0d353 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/de/vat_id.rs @@ -0,0 +1,85 @@ +//! German Umsatzsteuer-Identifikationsnummer (USt-IdNr) validator. +//! +//! 11 characters: `DE` prefix + 9 digits. The check digit at +//! position 9 (last) derives from positions 1–8 via the +//! community-documented ISO 7064 Mod 11, 10 heuristic. BZSt has +//! not published the official algorithm; tools across the EU +//! converge on this formulation, so we use it as the validator +//! and accept the rare false-negative trade-off. +//! +//! Whitespace, dots, and dashes are stripped before validation, +//! so `"DE 123 456 789"`, `"DE-123-456-789"`, and `"DE.123.456.789"` +//! all normalize to `"DE123456789"`. + +/// Return `true` when `value` is a valid German USt-IdNr per the +/// ISO 7064 Mod 11,10 heuristic. +pub fn vat_id(value: &str) -> bool { + let normalized: String = value + .chars() + .filter(|c| !matches!(c, ' ' | '.' 
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_known_vat_ids() {
        // Public BMW Group USt-IdNr.
        assert!(vat_id("DE129273398"));
        // Public Siemens AG USt-IdNr. Deliberately a different number
        // from the BMW one above so two independent vectors are
        // exercised.
        assert!(vat_id("DE129274202"));
    }

    #[test]
    fn strips_separators() {
        assert!(vat_id("DE 129 273 398"));
        assert!(vat_id("DE-129-273-398"));
        assert!(vat_id("DE.129.273.398"));
    }

    #[test]
    fn rejects_wrong_check_digit() {
        assert!(!vat_id("DE129273390"));
    }

    #[test]
    fn rejects_missing_de_prefix() {
        assert!(!vat_id("FR129273398"));
        assert!(!vat_id("129273398"));
    }

    #[test]
    fn rejects_wrong_length() {
        assert!(!vat_id("DE12345678"));
        assert!(!vat_id("DE1234567890"));
        assert!(!vat_id(""));
    }
}
/// Entity-class letters accepted in the first position.
const ENTITY_LETTERS: &str = "ABCDEFGHJNPQRSUVW";
/// Entity classes whose control character must be a letter. `K` is listed
/// but can never match here because it is absent from `ENTITY_LETTERS`.
const LETTER_ONLY: &str = "PQRSNWK";
/// Entity classes whose control character must be a digit.
const DIGIT_ONLY: &str = "ABEH";
/// Letter form of the control value: index `c` maps to `"JABCDEFGHI"[c]`.
const LETTER_TABLE: &[u8; 10] = b"JABCDEFGHI";

/// Return `true` when `value` is a structurally valid CIF: entity letter,
/// seven digits, and a control character matching the computed checksum.
///
/// ASCII whitespace and `-` are removed and the input is upper-cased first.
/// The control value is `c = (10 - total mod 10) mod 10`, where `total` is
/// the digit cross-sum of the doubled digits at 1-indexed odd positions plus
/// the raw digits at even positions. Whether the digit form, the letter
/// form, or either is accepted depends on the entity class.
pub fn cif(value: &str) -> bool {
    let chars: Vec<char> = value
        .chars()
        .filter(|c| !c.is_ascii_whitespace() && *c != '-')
        .map(|c| c.to_ascii_uppercase())
        .collect();
    if chars.len() != 9 {
        return false;
    }
    let (entity, control) = (chars[0], chars[8]);
    if !ENTITY_LETTERS.contains(entity) {
        return false;
    }

    let mut total: u32 = 0;
    for (idx, ch) in chars[1..8].iter().enumerate() {
        let d = match ch.to_digit(10) {
            Some(d) => d,
            None => return false,
        };
        // 0-indexed even == 1-indexed odd position: double and cross-sum.
        total += if idx % 2 == 0 {
            let doubled = d * 2;
            doubled / 10 + doubled % 10
        } else {
            d
        };
    }
    let c = (10 - total % 10) % 10;

    let letter_ok = control == char::from(LETTER_TABLE[c as usize]);
    let digit_ok = std::char::from_digit(c, 10) == Some(control);

    if LETTER_ONLY.contains(entity) {
        letter_ok
    } else if DIGIT_ONLY.contains(entity) {
        digit_ok
    } else {
        letter_ok || digit_ok
    }
}
+ assert!(cif("P1234567D")); + } + + #[test] + fn accepts_either_form_for_mixed_class() { + // `C` accepts both digit and letter forms. + assert!(cif("C12345674")); + assert!(cif("C1234567D")); + } + + #[test] + fn rejects_wrong_entity_letter() { + // `I` is not a valid CIF entity letter. + assert!(!cif("I12345674")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!cif("A12345670")); + } + + #[test] + fn rejects_digit_form_for_letter_class() { + // `P` requires a letter control; digit form must fail. + assert!(!cif("P12345674")); + } + + #[test] + fn rejects_letter_form_for_digit_class() { + // `A` requires a digit control; letter form must fail. + assert!(!cif("A1234567D")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!cif("A1234567")); + assert!(!cif("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/es/mod.rs b/crates/nvisy-pattern/src/validators/es/mod.rs new file mode 100644 index 00000000..bdb7acac --- /dev/null +++ b/crates/nvisy-pattern/src/validators/es/mod.rs @@ -0,0 +1,14 @@ +//! Spain-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"es.nif"`, `"es.nie"`, `"es.cif"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod cif; +mod nie; +mod nif; + +pub use self::cif::cif; +pub use self::nie::nie; +pub use self::nif::nif; diff --git a/crates/nvisy-pattern/src/validators/es/nie.rs b/crates/nvisy-pattern/src/validators/es/nie.rs new file mode 100644 index 00000000..45b32a78 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/es/nie.rs @@ -0,0 +1,83 @@ +//! Spanish NIE (Número de Identidad de Extranjero) validator. +//! +//! Format `[XYZ]` + 7 digits + control letter. The first letter +//! is replaced by its position in `XYZ` (`X→0`, `Y→1`, `Z→2`), +//! then the same Mod 23 check used by [`super::nif`] applies. 
+ +use super::nif::LETTERS; + +/// Return `true` when `value` is a valid NIE — `X`, `Y`, or `Z` +/// prefix + 7 digits + Mod 23 control letter, with optional `-`. +pub fn nie(value: &str) -> bool { + let normalized: String = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .map(|c| c.to_ascii_uppercase()) + .collect(); + let chars: Vec = normalized.chars().collect(); + if chars.len() != 9 { + return false; + } + let prefix_pos = match chars[0] { + 'X' => 0, + 'Y' => 1, + 'Z' => 2, + _ => return false, + }; + if !chars[1..8].iter().all(|c| c.is_ascii_digit()) { + return false; + } + let letter = chars[8]; + if !letter.is_ascii_uppercase() { + return false; + } + let body: String = std::iter::once(char::from(b'0' + prefix_pos)) + .chain(chars[1..8].iter().copied()) + .collect(); + let number: u64 = body.parse().unwrap_or(0); + LETTERS[(number % 23) as usize] as char == letter +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_nie() { + // X1234567 → 01234567 mod 23 = 12 → LETTERS[12] = 'L'. + assert!(nie("X1234567L")); + } + + #[test] + fn accepts_dash_separator() { + assert!(nie("X-1234567-L")); + } + + #[test] + fn accepts_y_prefix() { + // Y1234567 → 11234567 mod 23 = 10 → LETTERS[10] = 'X'. + assert!(nie("Y1234567X")); + } + + #[test] + fn accepts_z_prefix() { + // Z1234567 → 21234567 mod 23 = 1 → LETTERS[1] = 'R'. + assert!(nie("Z1234567R")); + } + + #[test] + fn rejects_wrong_prefix() { + assert!(!nie("A1234567L")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!nie("X1234567A")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!nie("X123456L")); + assert!(!nie("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/es/nif.rs b/crates/nvisy-pattern/src/validators/es/nif.rs new file mode 100644 index 00000000..b4815792 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/es/nif.rs @@ -0,0 +1,71 @@ +//! Spanish NIF / DNI checksum validator. +//! +//! 
/// Mod 23 control-letter table: the control letter for number `n` is
/// `LETTERS[n mod 23]`. Shared with the sibling `nie` validator.
pub(crate) const LETTERS: &[u8; 23] = b"TRWAGMYFPDXBNJZSQVHLCKE";

/// Return `true` when `value` is a valid NIF (DNI): 7 or 8 digits followed
/// by the Mod 23 control letter.
///
/// ASCII whitespace and `-` are removed first and the control letter is
/// compared case-insensitively. The remainder is taken over the numeric
/// value of the digits, so a dropped leading zero does not change the result.
pub fn nif(value: &str) -> bool {
    let chars: Vec<char> = value
        .chars()
        .filter(|c| !c.is_ascii_whitespace() && *c != '-')
        .collect();
    if !matches!(chars.len(), 8 | 9) {
        return false;
    }
    let (letter, digits) = match chars.split_last() {
        Some(parts) => parts,
        None => return false,
    };

    let mut number: u64 = 0;
    for c in digits {
        match c.to_digit(10) {
            Some(d) => number = number * 10 + u64::from(d),
            None => return false,
        }
    }

    // The table holds only uppercase ASCII letters, so a non-letter control
    // character can never compare equal.
    char::from(LETTERS[(number % 23) as usize]) == letter.to_ascii_uppercase()
}
/// Control-character alphabet: the control character is
/// `CONTROL_TABLE[n mod 31]`, where `n` is the date digits followed by the
/// serial digits read as one number.
const CONTROL_TABLE: &[u8; 31] = b"0123456789ABCDEFHJKLMNPRSTUVWXY";

/// Return `true` when `c` is one of the accepted century separators:
/// `+`, `-`, `A`–`F`, or `U`–`Y`.
fn is_separator(c: char) -> bool {
    matches!(c, '+' | '-' | 'A'..='F' | 'U'..='Y')
}

/// Return `true` when `value` is a valid 11-character HETU.
///
/// Surrounding whitespace is trimmed and the input is upper-cased. The
/// layout must be six date digits (`DDMMYY`), a century separator, three
/// serial digits, and a control character. The day must be in `1..=31` and
/// the month in `1..=12`; no per-month day limit is applied. The control
/// character must equal `CONTROL_TABLE[n mod 31]`.
pub fn hetu(value: &str) -> bool {
    let chars: Vec<char> = value.trim().to_ascii_uppercase().chars().collect();
    if chars.len() != 11 || !is_separator(chars[6]) {
        return false;
    }

    // Fold the nine digits (date + serial) into one number: DDMMYYSSS.
    let mut n: u64 = 0;
    for c in chars[..6].iter().chain(chars[7..10].iter()) {
        match c.to_digit(10) {
            Some(d) => n = n * 10 + u64::from(d),
            None => return false,
        }
    }

    let day = n / 10_000_000;
    let month = (n / 100_000) % 100;
    if !(1..=31).contains(&day) || !(1..=12).contains(&month) {
        return false;
    }

    // The table is purely alphanumeric, so any other control character
    // fails the comparison.
    char::from(CONTROL_TABLE[(n % 31) as usize]) == chars[10]
}
Finland-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"fi.hetu"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod hetu; + +pub use self::hetu::hetu; diff --git a/crates/nvisy-pattern/src/validators/iban.rs b/crates/nvisy-pattern/src/validators/iban.rs index 15888d7f..13241bdc 100644 --- a/crates/nvisy-pattern/src/validators/iban.rs +++ b/crates/nvisy-pattern/src/validators/iban.rs @@ -1,49 +1,21 @@ -//! ISO 13616 IBAN checksum validator. +//! ISO 13616 IBAN validator backed by the [`iban_validate`] +//! crate, which ships the SWIFT IBAN registry (per-country +//! length and BBAN structure) on top of the mod-97 checksum. +//! +//! [`iban_validate`]: https://crates.io/crates/iban_validate -/// Return `true` if `value` passes the ISO 13616 mod-97 IBAN -/// checksum. -/// -/// Whitespace and dashes are stripped before validation. The -/// country code and check digits are moved to the end, letters -/// are converted to numbers (`A`=10 … `Z`=35), and the result is -/// accepted when `mod 97 == 1`. +use iban::Iban; + +/// Return `true` when `value` is a valid IBAN — both the mod-97 +/// checksum and the country-specific length/BBAN structure must +/// match the SWIFT registry. Whitespace and dashes are stripped +/// before validation. pub fn iban(value: &str) -> bool { let cleaned: String = value .chars() .filter(|c| !c.is_ascii_whitespace() && *c != '-') .collect(); - - if cleaned.len() < 5 { - return false; - } - - // All characters must be alphanumeric ASCII. - if !cleaned.chars().all(|c| c.is_ascii_alphanumeric()) { - return false; - } - - // Move first 4 characters (country code + check digits) to the end. - let rearranged = format!("{}{}", &cleaned[4..], &cleaned[..4]); - - // Convert letters to two-digit numbers (A=10 … Z=35) and compute mod 97. 
- let mut remainder: u32 = 0; - for ch in rearranged.chars() { - let digit_val = if ch.is_ascii_digit() { - ch.to_digit(10).unwrap() - } else { - // A=10, B=11, … Z=35 - (ch.to_ascii_uppercase() as u32) - ('A' as u32) + 10 - }; - - // For two-digit values (>=10) we need to shift by two decimal places. - if digit_val >= 10 { - remainder = (remainder * 100 + digit_val) % 97; - } else { - remainder = (remainder * 10 + digit_val) % 97; - } - } - - remainder == 1 + cleaned.parse::().is_ok() } #[cfg(test)] @@ -52,7 +24,6 @@ mod tests { #[test] fn valid_ibans() { - // GB, DE, FR examples from Wikipedia. assert!(iban("GB29 NWBK 6016 1331 9268 19")); assert!(iban("DE89370400440532013000")); assert!(iban("FR76 3000 6000 0112 3456 7890 189")); @@ -64,6 +35,19 @@ mod tests { assert!(!iban("DE00370400440532013000")); } + #[test] + fn rejects_wrong_country_length() { + // German IBAN must be 22 chars; trimming the last block + // leaves a mod-97-valid string that the registry rejects. + assert!(!iban("DE89370400440532013")); + } + + #[test] + fn rejects_unknown_country_code() { + // `XX` is not in the SWIFT registry. + assert!(!iban("XX29NWBK60161331926819")); + } + #[test] fn too_short() { assert!(!iban("GB29")); diff --git a/crates/nvisy-pattern/src/validators/in/aadhaar.rs b/crates/nvisy-pattern/src/validators/in/aadhaar.rs new file mode 100644 index 00000000..2640416d --- /dev/null +++ b/crates/nvisy-pattern/src/validators/in/aadhaar.rs @@ -0,0 +1,72 @@ +//! Indian Aadhaar number validator. +//! +//! 12 digits issued by UIDAI. Structural rules: leading digit +//! ≥ 2 (UIDAI reserves 0xx and 1xx); the number must not be a +//! palindrome; Verhoeff checksum over all 12 digits. + +use super::super::verhoeff::verhoeff; + +/// Return `true` when `value` is a valid 12-digit Aadhaar. +/// Whitespace and `-`/`:` separators are stripped before +/// validation. 
+pub fn aadhaar(value: &str) -> bool { + let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-' && *c != ':') + .count(); + if digits.len() != 12 || extras > 0 { + return false; + } + let first = digits.chars().next().unwrap().to_digit(10).unwrap(); + if first < 2 { + return false; + } + let reversed: String = digits.chars().rev().collect(); + if reversed == digits { + return false; + } + verhoeff(&digits) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_aadhaar() { + // 234123412346 — widely-quoted Verhoeff-valid test value. + assert!(aadhaar("234123412346")); + } + + #[test] + fn accepts_with_separators() { + assert!(aadhaar("2341 2341 2346")); + assert!(aadhaar("2341-2341-2346")); + assert!(aadhaar("2341:2341:2346")); + } + + #[test] + fn rejects_leading_digit_below_two() { + assert!(!aadhaar("134123412346")); + assert!(!aadhaar("034123412346")); + } + + #[test] + fn rejects_palindrome() { + // Palindrome would be rejected even with valid checksum. + assert!(!aadhaar("212121212121")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!aadhaar("234123412340")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!aadhaar("23412341234")); + assert!(!aadhaar("2341234123466")); + assert!(!aadhaar("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/in/gstin.rs b/crates/nvisy-pattern/src/validators/in/gstin.rs new file mode 100644 index 00000000..e226c7cc --- /dev/null +++ b/crates/nvisy-pattern/src/validators/in/gstin.rs @@ -0,0 +1,89 @@ +//! Indian Goods and Services Tax Identification Number (GSTIN) +//! validator. +//! +//! 15 chars: state code (01-37) + 10-char PAN + 13th char +//! (registration sequence) + `Z` literal at position 14 + check +//! digit at position 15. The check digit uses a base-36 weighted +//! sum per GST Network spec. 
/// Base-36 alphabet; a character's value is its index in this table.
const BASE36: &[u8; 36] = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ";

/// Return the base-36 value of `c` (`0`-`9` → 0-9, `A`-`Z` → 10-35), or
/// `None` for anything else, including lowercase letters.
fn base36_value(c: char) -> Option<u32> {
    // Guard before the byte cast: `c as u8` truncates, so a non-ASCII
    // character such as `'Ł'` (U+0141) would otherwise alias `'A'`.
    if !c.is_ascii() {
        return None;
    }
    BASE36.iter().position(|&b| b == c as u8).map(|p| p as u32)
}

/// Return `true` when `value` is a valid 15-character GSTIN.
///
/// Surrounding whitespace is trimmed and the input is upper-cased. The
/// first two characters must be a state code in `01..=37`, position 14 must
/// be the literal `Z`, every character must be a base-36 digit, and the
/// last character must equal the computed check digit.
///
/// Checksum: each of the first 14 base-36 values is multiplied by a factor
/// alternating 1, 2, 1, 2, …; each product contributes its base-36 digit
/// sum (quotient + remainder of division by 36). The check value is
/// `(36 - total mod 36) mod 36`.
pub fn gstin(value: &str) -> bool {
    let chars: Vec<char> = value.trim().to_ascii_uppercase().chars().collect();
    if chars.len() != 15 || chars[13] != 'Z' {
        return false;
    }

    let state = match (chars[0].to_digit(10), chars[1].to_digit(10)) {
        (Some(tens), Some(units)) => tens * 10 + units,
        _ => return false,
    };
    if !(1..=37).contains(&state) {
        return false;
    }

    let mut total = 0u32;
    for (i, ch) in chars[..14].iter().enumerate() {
        let v = match base36_value(*ch) {
            Some(v) => v,
            None => return false,
        };
        let factor = if i % 2 == 0 { 1 } else { 2 };
        let p = v * factor;
        total += p / 36 + p % 36;
    }
    let check_value = (36 - total % 36) % 36;

    // A check character outside the alphabet yields `None` and is rejected.
    base36_value(chars[14]) == Some(check_value)
}
+ assert!(gstin("29ABCDE1234F1ZW")); + } + + #[test] + fn rejects_invalid_state_code() { + assert!(!gstin("00AAAPL1234C1ZE")); + assert!(!gstin("99AAAPL1234C1ZE")); + } + + #[test] + fn rejects_missing_z_marker() { + assert!(!gstin("27AAAPL1234C1AE")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!gstin("27AAAPL1234C1Z0")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!gstin("27AAAPL1234C1Z")); + assert!(!gstin("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/in/mod.rs b/crates/nvisy-pattern/src/validators/in/mod.rs new file mode 100644 index 00000000..bc7044d9 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/in/mod.rs @@ -0,0 +1,14 @@ +//! India-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"in.aadhaar"`, `"in.pan"`, `"in.gstin"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod aadhaar; +mod gstin; +mod pan; + +pub use self::aadhaar::aadhaar; +pub use self::gstin::gstin; +pub use self::pan::pan; diff --git a/crates/nvisy-pattern/src/validators/in/pan.rs b/crates/nvisy-pattern/src/validators/in/pan.rs new file mode 100644 index 00000000..2310a867 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/in/pan.rs @@ -0,0 +1,71 @@ +//! Indian Permanent Account Number (PAN) validator. +//! +//! 10 chars in the structural format `AAAAA9999A`. The 4th +//! character encodes the entity type per the Income Tax +//! Department: `P` individual, `C` company, `H` HUF, `F` firm, +//! `A` AOP, `T` trust, `B` BOI, `L` local authority, `J` +//! artificial juridical, `G` government. No published checksum. + +const ENTITY_TYPES: &str = "PCHFATBLJG"; + +/// Return `true` when `value` is a structurally valid PAN. 
+pub fn pan(value: &str) -> bool { + let normalized = value.trim().to_ascii_uppercase(); + let chars: Vec = normalized.chars().collect(); + if chars.len() != 10 { + return false; + } + if !chars[..3].iter().all(|c| c.is_ascii_uppercase()) { + return false; + } + if !ENTITY_TYPES.contains(chars[3]) { + return false; + } + if !chars[4].is_ascii_uppercase() { + return false; + } + if !chars[5..9].iter().all(|c| c.is_ascii_digit()) { + return false; + } + chars[9].is_ascii_uppercase() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_pan() { + // ABCPK1234E — individual entity (`P` at position 4). + assert!(pan("ABCPK1234E")); + } + + #[test] + fn accepts_lowercase_input() { + assert!(pan("abcpk1234e")); + } + + #[test] + fn accepts_company_pan() { + // ABCCD1234E — company entity (`C` at position 4). + assert!(pan("ABCCD1234E")); + } + + #[test] + fn rejects_invalid_entity_type() { + // `X` is not a valid entity type at position 4. + assert!(!pan("ABCXK1234E")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!pan("ABCPK1234")); + assert!(!pan("ABCPK1234EE")); + assert!(!pan("")); + } + + #[test] + fn rejects_wrong_digit_section() { + assert!(!pan("ABCPK12A4E")); + } +} diff --git a/crates/nvisy-pattern/src/validators/it/fiscal_code.rs b/crates/nvisy-pattern/src/validators/it/fiscal_code.rs new file mode 100644 index 00000000..880f251b --- /dev/null +++ b/crates/nvisy-pattern/src/validators/it/fiscal_code.rs @@ -0,0 +1,109 @@ +//! Italian Codice Fiscale (CF) checksum validator. +//! +//! 16 characters: 6 letters (surname/name initials) + 2 digits +//! (year) + 1 letter (month) + 2 digits (day, +40 for female) + +//! 1 letter + 3 digits (municipality code) + 1 control letter. +//! +//! Omocodia: when two people would receive the same CF, certain +//! digit positions get rewritten as specific letters +//! (`0→L, 1→M, 2→N, 3→P, 4→Q, 5→R, 6→S, 7→T, 8→U, 9→V`). The +//! 
/// Checksum contribution of a character at an odd 1-indexed position,
/// indexed by `table_index`: entries 0-9 are for `0`-`9`, entries 10-35
/// for `A`-`Z`.
const ODD_TABLE: [u32; 36] = [
    // 0-9
    1, 0, 5, 7, 9, 13, 15, 17, 19, 21,
    // A-Z
    1, 0, 5, 7, 9, 13, 15, 17, 19, 21, 2, 4, 18, 20, 11, 3, 6, 8, 12, 14, 16, 10, 22, 25, 24, 23,
];

/// Map `0`-`9` to 0-9 and `A`-`Z` to 10-35 for indexing `ODD_TABLE`;
/// `None` for any other character.
fn table_index(c: char) -> Option<usize> {
    match c {
        '0'..='9' => Some(c as usize - '0' as usize),
        'A'..='Z' => Some(10 + (c as usize - 'A' as usize)),
        _ => None,
    }
}

/// Checksum contribution of a character at an even 1-indexed position:
/// a digit counts as itself and a letter as its offset from `A`. Any other
/// character contributes 0.
fn even_value(c: char) -> u32 {
    match c {
        '0'..='9' => c as u32 - '0' as u32,
        'A'..='Z' => c as u32 - 'A' as u32,
        _ => 0,
    }
}

/// Return `true` when `value` is a 16-character Codice Fiscale whose
/// control letter matches the computed checksum.
///
/// Surrounding whitespace is trimmed and the input is upper-cased; every
/// character must be ASCII alphanumeric. The first 15 characters are
/// summed, using `ODD_TABLE` for odd 1-indexed positions and `even_value`
/// for even ones, and `sum mod 26` selects the expected letter from `A`-`Z`.
/// Only the checksum is verified, not the field layout.
pub fn fiscal_code(value: &str) -> bool {
    let chars: Vec<char> = value.trim().to_ascii_uppercase().chars().collect();
    if chars.len() != 16 || !chars.iter().all(|c| c.is_ascii_alphanumeric()) {
        return false;
    }
    let (control, body) = match chars.split_last() {
        Some(parts) => parts,
        None => return false,
    };

    let mut sum: u32 = 0;
    for (idx, &ch) in body.iter().enumerate() {
        // 0-indexed even == 1-indexed odd position.
        sum += if idx % 2 == 0 {
            match table_index(ch) {
                Some(i) => ODD_TABLE[i],
                None => return false,
            }
        } else {
            even_value(ch)
        };
    }

    *control == char::from(b'A' + (sum % 26) as u8)
}
+ assert!(fiscal_code("RSSMRA70A01H501S")); + } + + #[test] + fn accepts_second_vector() { + // Surname MRT, name MTT, born 1925-04-09 in Florence + // (F205), control letter Z. + assert!(fiscal_code("MRTMTT25D09F205Z")); + } + + #[test] + fn accepts_lowercase_input() { + assert!(fiscal_code("rssmra70a01h501s")); + } + + #[test] + fn rejects_wrong_control_letter() { + assert!(!fiscal_code("RSSMRA70A01H501Y")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!fiscal_code("RSSMRA70A01H501")); + assert!(!fiscal_code("RSSMRA70A01H501XX")); + assert!(!fiscal_code("")); + } + + #[test] + fn rejects_non_alphanumeric() { + assert!(!fiscal_code("RSSMRA70A01H501-")); + } +} diff --git a/crates/nvisy-pattern/src/validators/it/mod.rs b/crates/nvisy-pattern/src/validators/it/mod.rs new file mode 100644 index 00000000..56d0a77f --- /dev/null +++ b/crates/nvisy-pattern/src/validators/it/mod.rs @@ -0,0 +1,12 @@ +//! Italy-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"it.fiscal_code"`, `"it.vat_code"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod fiscal_code; +mod vat_code; + +pub use self::fiscal_code::fiscal_code; +pub use self::vat_code::vat_code; diff --git a/crates/nvisy-pattern/src/validators/it/vat_code.rs b/crates/nvisy-pattern/src/validators/it/vat_code.rs new file mode 100644 index 00000000..f0f1a797 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/it/vat_code.rs @@ -0,0 +1,76 @@ +//! Italian Partita IVA (P.IVA) checksum validator. +//! +//! 11 digits. Algorithm: split into 5 pairs of (odd, even) +//! positions (1-indexed); raw-sum the odd digits, then for each +//! even digit double it and subtract 9 when the result is ≥10, +//! sum those. Total mod 10 should equal the complement of the +//! 11th (check) digit. The all-zero string passes the math but +//! is reserved — reject explicitly. 
/// Return `true` when `value` is a valid 11-digit Partita IVA.
///
/// ASCII whitespace, `-`, and `_` are removed first; any other non-digit
/// rejects the input. The first ten digits are taken as five (odd, even)
/// pairs: odd-position digits are added raw, and even-position digits are
/// doubled with 9 subtracted when the result exceeds 9. The check digit
/// must equal `(10 - total mod 10) mod 10`. The all-zero number satisfies
/// the arithmetic but is rejected explicitly.
pub fn vat_code(value: &str) -> bool {
    let digits: Vec<u32> = match value
        .chars()
        .filter(|c| !c.is_ascii_whitespace() && *c != '-' && *c != '_')
        .map(|c| c.to_digit(10))
        .collect::<Option<Vec<u32>>>()
    {
        Some(digits) => digits,
        None => return false,
    };
    if digits.len() != 11 || digits.iter().all(|&d| d == 0) {
        return false;
    }
    let (check, body) = match digits.split_last() {
        Some(parts) => parts,
        None => return false,
    };

    let total: u32 = body
        .chunks_exact(2)
        .map(|pair| {
            let doubled = pair[1] * 2;
            pair[0] + if doubled > 9 { doubled - 9 } else { doubled }
        })
        .sum();

    (10 - total % 10) % 10 == *check
}
/// Per-position weights for the first nine digits.
const MAGIC: [u32; 9] = [1, 3, 7, 1, 3, 7, 1, 3, 5];

/// Return `true` when `value` is a valid 10-digit BRN.
///
/// ASCII whitespace and `-` are ignored; any other non-digit rejects the
/// input. The checksum is the weighted sum of the first nine digits using
/// `MAGIC`, plus the tens digit of the ninth weighted product
/// (`digit * 5 / 10`). The tenth digit must equal `(10 - sum mod 10) mod 10`.
pub fn brn(value: &str) -> bool {
    let mut digits: Vec<u32> = Vec::with_capacity(10);
    for c in value.chars() {
        match c.to_digit(10) {
            Some(d) => digits.push(d),
            None if c.is_ascii_whitespace() || c == '-' => {}
            None => return false,
        }
    }
    if digits.len() != 10 {
        return false;
    }

    // `zip` stops after the nine weights, leaving the check digit out.
    let weighted: u32 = digits.iter().zip(MAGIC.iter()).map(|(d, w)| d * w).sum();
    let carry = digits[8] * MAGIC[8] / 10;
    let total = weighted + carry;

    (10 - total % 10) % 10 == digits[9]
}
/// Two-digit issuing-office codes accepted as the leading digits.
const REGION_CODES: &[&str] = &[
    "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", "25", "26",
    "28",
];

/// Return `true` when `value` is a 12-digit Korean driver license number
/// whose first two digits are one of `REGION_CODES`.
///
/// ASCII whitespace and `-` are ignored; any other non-digit rejects the
/// input. Only the length and region code are checked, not the trailing
/// check digits.
pub fn driver_license(value: &str) -> bool {
    let mut digits = String::with_capacity(12);
    for c in value.chars() {
        if c.is_ascii_digit() {
            digits.push(c);
        } else if !(c.is_ascii_whitespace() || c == '-') {
            return false;
        }
    }
    digits.len() == 12 && REGION_CODES.iter().any(|code| digits.starts_with(*code))
}
+pub fn frn(value: &str) -> bool { + let digits: Vec = value + .chars() + .filter(|c| c.is_ascii_digit()) + .map(|c| c.to_digit(10).unwrap()) + .collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-') + .count(); + if digits.len() != 13 || extras > 0 { + return false; + } + if !structural_ok(&digits) { + return false; + } + if !(5..=8).contains(&digits[6]) { + return false; + } + let region = digits[7] * 10 + digits[8]; + if region > 95 { + return false; + } + let sum: u32 = digits[..12].iter().zip(WEIGHTS).map(|(d, w)| d * w).sum(); + let check = (13 - (sum % 11)) % 10; + check == digits[12] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_frn() { + // 900101 + gender 5 (1900s male foreigner) + region 11 + + // 234 + computed check digit 4. + assert!(frn("9001015112344")); + } + + #[test] + fn accepts_with_dash_separator() { + assert!(frn("900101-5112344")); + } + + #[test] + fn rejects_non_foreigner_gender_digit() { + // Gender 1 → not FRN, expects 5-8. + assert!(!frn("9001011112344")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!frn("9001015112340")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!frn("900101511234")); + assert!(!frn("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/kr/mod.rs b/crates/nvisy-pattern/src/validators/kr/mod.rs new file mode 100644 index 00000000..34c9cfcf --- /dev/null +++ b/crates/nvisy-pattern/src/validators/kr/mod.rs @@ -0,0 +1,17 @@ +//! South Korea-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"kr.rrn"`, `"kr.frn"`, `"kr.brn"`, +//! `"kr.driver_license"`. +//! +//! 
[`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod brn; +mod driver_license; +mod frn; +mod rrn; + +pub use self::brn::brn; +pub use self::driver_license::driver_license; +pub use self::frn::frn; +pub use self::rrn::rrn; diff --git a/crates/nvisy-pattern/src/validators/kr/rrn.rs b/crates/nvisy-pattern/src/validators/kr/rrn.rs new file mode 100644 index 00000000..1c40c386 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/kr/rrn.rs @@ -0,0 +1,86 @@ +//! Korean Resident Registration Number (RRN) validator. +//! +//! 13 digits in the form `YYMMDD-GHIJKLX` per the Ministry of +//! the Interior and Safety spec. Pre-October 2020 numbers carry +//! a checksum X computed from the first 12 digits with the +//! weights `[2,3,4,5,6,7,8,9,2,3,4,5]`; the checksum equals +//! `(11 - sum mod 11) mod 10`. Post-October 2020 numbers carry +//! a random tail and pass structural checks only. + +pub(super) const WEIGHTS: [u32; 12] = [2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5]; + +/// Return `true` when `value` is a valid RRN. Hyphen separators +/// are stripped before validation. 
+pub fn rrn(value: &str) -> bool { + let digits: Vec = value + .chars() + .filter(|c| c.is_ascii_digit()) + .map(|c| c.to_digit(10).unwrap()) + .collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-') + .count(); + if digits.len() != 13 || extras > 0 { + return false; + } + if !structural_ok(&digits) { + return false; + } + let region = digits[7] * 10 + digits[8]; + if region > 95 { + return false; + } + let sum: u32 = digits[..12].iter().zip(WEIGHTS).map(|(d, w)| d * w).sum(); + let check = (11 - (sum % 11)) % 10; + check == digits[12] +} + +pub(super) fn structural_ok(digits: &[u32]) -> bool { + let month = digits[2] * 10 + digits[3]; + let day = digits[4] * 10 + digits[5]; + (1..=12).contains(&month) && (1..=31).contains(&day) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_rrn() { + // 800101 (DOB 1980-01-01) + gender 1 + region 11 + 234 + + // computed check digit 3. + assert!(rrn("8001011112343")); + } + + #[test] + fn accepts_with_dash_separator() { + assert!(rrn("800101-1112343")); + } + + #[test] + fn rejects_invalid_month() { + assert!(!rrn("8013011112343")); + } + + #[test] + fn rejects_invalid_day() { + assert!(!rrn("8001321112343")); + } + + #[test] + fn rejects_invalid_region() { + assert!(!rrn("8001011962343")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!rrn("8001011112340")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!rrn("80010111123")); + assert!(!rrn("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/mod.rs b/crates/nvisy-pattern/src/validators/mod.rs index d833d6b8..9c00208d 100644 --- a/crates/nvisy-pattern/src/validators/mod.rs +++ b/crates/nvisy-pattern/src/validators/mod.rs @@ -11,11 +11,22 @@ //! ([`luhn`], [`iban`], [`phone`], [`date`], [`btc`]) plus //! jurisdiction-scoped sets re-exported from [`us`] (`"us.ssn"`, //! `"us.aba_routing"`, `"us.npi"`, `"us.dea_number"`, -//! 
`"us.postal_code"`) and [`uk`] +//! `"us.postal_code"`), [`uk`] //! (`"uk.nhs"`, `"uk.nino"`, `"uk.driving_licence"`, -//! `"uk.vehicle_registration"`). Each validator is also re-exported -//! as a free function so consumers can compose a custom registry -//! without taking the full set. +//! `"uk.vehicle_registration"`), [`de`] (`"de.bsnr"`, +//! `"de.lanr"`, `"de.passport"`, `"de.id_card"`, +//! `"de.health_insurance"`, `"de.social_security"`, +//! `"de.tax_id"`, `"de.vat_id"`, `"de.plz"`), [`es`] +//! (`"es.nif"`, `"es.nie"`, `"es.cif"`), [`it`] +//! (`"it.fiscal_code"`, `"it.vat_code"`), [`pl`] +//! (`"pl.pesel"`, `"pl.nip"`, `"pl.regon"`), [`au`] +//! (`"au.abn"`, `"au.acn"`, `"au.medicare"`, `"au.tfn"`), [`ca`] +//! (`"ca.sin"`), [`fi`] (`"fi.hetu"`), [`se`] +//! (`"se.personnummer"`, `"se.organisationsnummer"`), and the +//! India module (`"in.aadhaar"`, `"in.pan"`, `"in.gstin"`). +//! Each validator is also re-exported as a free function so +//! consumers can compose a custom registry without taking the +//! full set. //! //! [`Variant`]: crate::Variant //! [`Regex`]: crate::Regex @@ -25,7 +36,22 @@ mod date; mod iban; mod luhn; mod phone; +mod verhoeff; +pub mod au; +pub mod ca; +pub mod de; +pub mod es; +pub mod fi; +pub mod r#in; +pub mod it; +pub mod kr; +pub mod ng; +pub mod pl; +pub mod se; +pub mod sg; +pub mod th; +pub mod tr; pub mod uk; pub mod us; @@ -51,7 +77,7 @@ pub use self::phone::phone; /// guessing across a fixed fallback set. Validators that don't /// need either field can ignore it via `_ctx`. /// -/// [`RecognizerInput`]: crate::recognition::RecognizerInput +/// [`RecognizerInput`]: nvisy_core::recognition::RecognizerInput #[derive(Debug, Clone, Default)] pub struct ValidationContext { /// ISO 3166-1 alpha-2 jurisdiction associated with the input, @@ -116,6 +142,40 @@ impl ValidatorRegistry { /// /// UK-scoped: `"uk.nhs"`, `"uk.nino"`, /// `"uk.driving_licence"`, `"uk.vehicle_registration"`. 
+ /// + /// DE-scoped: `"de.bsnr"`, `"de.lanr"`, `"de.passport"`, + /// `"de.id_card"`, `"de.health_insurance"`, + /// `"de.social_security"`, `"de.tax_id"`, `"de.vat_id"`, + /// `"de.plz"`. + /// + /// ES-scoped: `"es.nif"`, `"es.nie"`, `"es.cif"`. + /// + /// IT-scoped: `"it.fiscal_code"`, `"it.vat_code"`. + /// + /// PL-scoped: `"pl.pesel"`, `"pl.nip"`, `"pl.regon"`. + /// + /// AU-scoped: `"au.abn"`, `"au.acn"`, `"au.medicare"`, + /// `"au.tfn"`. + /// + /// CA-scoped: `"ca.sin"`. + /// + /// FI-scoped: `"fi.hetu"`. + /// + /// SE-scoped: `"se.personnummer"`, + /// `"se.organisationsnummer"`. + /// + /// IN-scoped: `"in.aadhaar"`, `"in.pan"`, `"in.gstin"`. + /// + /// KR-scoped: `"kr.rrn"`, `"kr.frn"`, `"kr.brn"`, + /// `"kr.driver_license"`. + /// + /// SG-scoped: `"sg.nric"`, `"sg.uen"`. + /// + /// TR-scoped: `"tr.tckn"`. + /// + /// NG-scoped: `"ng.nin"`. + /// + /// TH-scoped: `"th.national_id"`. #[must_use] pub fn builtin() -> Self { Self::empty() @@ -133,6 +193,43 @@ impl ValidatorRegistry { .with_simple("uk.nino", uk::nino) .with_simple("uk.driving_licence", uk::driving_licence) .with_simple("uk.vehicle_registration", uk::vehicle_registration) + .with_simple("de.bsnr", de::bsnr) + .with_simple("de.lanr", de::lanr) + .with_simple("de.passport", de::passport) + .with_simple("de.id_card", de::id_card) + .with_simple("de.health_insurance", de::health_insurance) + .with_simple("de.social_security", de::social_security) + .with_simple("de.tax_id", de::tax_id) + .with_simple("de.vat_id", de::vat_id) + .with_simple("de.plz", de::plz) + .with_simple("es.nif", es::nif) + .with_simple("es.nie", es::nie) + .with_simple("es.cif", es::cif) + .with_simple("it.fiscal_code", it::fiscal_code) + .with_simple("it.vat_code", it::vat_code) + .with_simple("pl.pesel", pl::pesel) + .with_simple("pl.nip", pl::nip) + .with_simple("pl.regon", pl::regon) + .with_simple("au.abn", au::abn) + .with_simple("au.acn", au::acn) + .with_simple("au.medicare", au::medicare) + 
.with_simple("au.tfn", au::tfn) + .with_simple("ca.sin", ca::sin) + .with_simple("fi.hetu", fi::hetu) + .with_simple("se.personnummer", se::personnummer) + .with_simple("se.organisationsnummer", se::organisationsnummer) + .with_simple("in.aadhaar", r#in::aadhaar) + .with_simple("in.pan", r#in::pan) + .with_simple("in.gstin", r#in::gstin) + .with_simple("kr.rrn", kr::rrn) + .with_simple("kr.frn", kr::frn) + .with_simple("kr.brn", kr::brn) + .with_simple("kr.driver_license", kr::driver_license) + .with_simple("sg.nric", sg::nric) + .with_simple("sg.uen", sg::uen) + .with_simple("tr.tckn", tr::tckn) + .with_simple("ng.nin", ng::nin) + .with_simple("th.national_id", th::national_id) } /// Register a context-aware `validator` under `name`, diff --git a/crates/nvisy-pattern/src/validators/ng/mod.rs b/crates/nvisy-pattern/src/validators/ng/mod.rs new file mode 100644 index 00000000..dad8029a --- /dev/null +++ b/crates/nvisy-pattern/src/validators/ng/mod.rs @@ -0,0 +1,10 @@ +//! Nigeria-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"ng.nin"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod nin; + +pub use self::nin::nin; diff --git a/crates/nvisy-pattern/src/validators/ng/nin.rs b/crates/nvisy-pattern/src/validators/ng/nin.rs new file mode 100644 index 00000000..50879c66 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/ng/nin.rs @@ -0,0 +1,59 @@ +//! Nigerian National Identification Number (NIN) validator. +//! +//! 11 digits issued by the National Identity Management +//! Commission (NIMC). The last digit is a Verhoeff checksum +//! over the preceding 10. + +use super::super::verhoeff::verhoeff; + +/// Return `true` when `value` is a valid 11-digit NIN. 
+pub fn nin(value: &str) -> bool { + let digits: String = value.chars().filter(|c| c.is_ascii_digit()).collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-') + .count(); + if digits.len() != 11 || extras > 0 { + return false; + } + verhoeff(&digits) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_nin() { + // Body 1234567890 → Verhoeff check 2. + assert!(nin("12345678902")); + } + + #[test] + fn accepts_second_vector() { + assert!(nin("98765432102")); + } + + #[test] + fn accepts_with_separators() { + assert!(nin("1234 5678 902")); + assert!(nin("123-456-78902")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!nin("12345678900")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!nin("1234567890")); + assert!(!nin("123456789033")); + assert!(!nin("")); + } + + #[test] + fn rejects_non_digit() { + assert!(!nin("1234567890A")); + } +} diff --git a/crates/nvisy-pattern/src/validators/pl/mod.rs b/crates/nvisy-pattern/src/validators/pl/mod.rs new file mode 100644 index 00000000..a34423de --- /dev/null +++ b/crates/nvisy-pattern/src/validators/pl/mod.rs @@ -0,0 +1,14 @@ +//! Poland-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"pl.pesel"`, `"pl.nip"`, `"pl.regon"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod nip; +mod pesel; +mod regon; + +pub use self::nip::nip; +pub use self::pesel::pesel; +pub use self::regon::regon; diff --git a/crates/nvisy-pattern/src/validators/pl/nip.rs b/crates/nvisy-pattern/src/validators/pl/nip.rs new file mode 100644 index 00000000..c94d945d --- /dev/null +++ b/crates/nvisy-pattern/src/validators/pl/nip.rs @@ -0,0 +1,65 @@ +//! Polish NIP (Numer Identyfikacji Podatkowej) validator. +//! +//! 10 digits. Check digit per Ustawa z dnia 13 października 1995 +//! r. 
// o zasadach ewidencji i identyfikacji podatników: weights
// `[6, 5, 7, 2, 3, 4, 5, 6, 7]` over the first 9 digits;
// check = `sum mod 11`. A computed value of 10 means the NIP is
// invalid (never assigned).

/// Return `true` when `value` is a valid 10-digit NIP. Hyphen
/// and space separators in the conventional `XXX-XXX-XX-XX` or
/// `XXX-XX-XX-XXX` renderings are stripped before validation.
///
/// ASCII whitespace and `-` are dropped wherever they occur;
/// every remaining character must be an ASCII digit. The tenth
/// digit must equal the weighted sum of the first nine modulo
/// 11, and a remainder of 10 is never valid.
pub fn nip(value: &str) -> bool {
    const WEIGHTS: [u32; 9] = [6, 5, 7, 2, 3, 4, 5, 6, 7];

    let mut digits: Vec<u32> = Vec::with_capacity(10);
    for c in value
        .chars()
        .filter(|c| !c.is_ascii_whitespace() && *c != '-')
    {
        // `to_digit(10)` is `None` for anything but ASCII `0-9`.
        match c.to_digit(10) {
            Some(d) => digits.push(d),
            None => return false,
        }
    }
    if digits.len() != 10 {
        return false;
    }
    // `zip` stops after the nine weights, leaving the check digit out.
    let sum: u32 = digits.iter().zip(WEIGHTS).map(|(d, w)| d * w).sum();
    let check = sum % 11;
    check != 10 && check == digits[9]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_canonical_nip() {
        // 1060000062 — Ministerstwo Finansów reference NIP.
        assert!(nip("1060000062"));
    }

    #[test]
    fn accepts_with_separators() {
        assert!(nip("106-000-00-62"));
        assert!(nip("106 000 00 62"));
    }

    #[test]
    fn rejects_wrong_checksum() {
        assert!(!nip("1060000060"));
    }

    #[test]
    fn rejects_wrong_length() {
        assert!(!nip("106000006"));
        assert!(!nip("10600000622"));
        assert!(!nip(""));
    }

    #[test]
    fn rejects_non_digit() {
        assert!(!nip("106000006A"));
    }
}

// ---- patch text for the next file, kept verbatim ----
// diff --git a/crates/nvisy-pattern/src/validators/pl/pesel.rs b/crates/nvisy-pattern/src/validators/pl/pesel.rs
// new file mode 100644
// index 00000000..9ef95746
// --- /dev/null
// +++ b/crates/nvisy-pattern/src/validators/pl/pesel.rs
// @@ -0,0 +1,53 @@
// +//! Polish PESEL (Powszechny Elektroniczny System Ewidencji
// +//! Ludności) checksum validator.
// +//!
// +//! 11 digits encoding birth date + sex + serial + check digit.
// +//! Check digit per the Ministerstwo Cyfryzacji spec: weights
// +//!
// `[1, 3, 7, 9, 1, 3, 7, 9, 1, 3]` over the first 10 digits;
// check = `(10 - sum mod 10) mod 10`.

/// Return `true` when `value` is a valid 11-digit PESEL.
///
/// ASCII whitespace is ignored; every other character must be
/// an ASCII digit (hyphens are not accepted). The eleventh
/// digit must equal `(10 - sum mod 10) mod 10`, where `sum` is
/// the weighted sum of the first ten digits. The encoded birth
/// date is not checked.
pub fn pesel(value: &str) -> bool {
    const WEIGHTS: [u32; 10] = [1, 3, 7, 9, 1, 3, 7, 9, 1, 3];

    // Collecting into `Option` yields `None` as soon as one
    // character is not an ASCII digit.
    let parsed: Option<Vec<u32>> = value
        .chars()
        .filter(|c| !c.is_ascii_whitespace())
        .map(|c| c.to_digit(10))
        .collect();
    let digits = match parsed {
        Some(digits) if digits.len() == 11 => digits,
        _ => return false,
    };
    // `zip` stops after the ten weights, leaving the check digit out.
    let sum: u32 = digits.iter().zip(WEIGHTS).map(|(d, w)| d * w).sum();
    (10 - sum % 10) % 10 == digits[10]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_canonical_pesel() {
        // 44051401359 — widely-quoted example from official PESEL
        // documentation: born 1944-05-14, male, serial 0135,
        // check 9.
        assert!(pesel("44051401359"));
    }

    #[test]
    fn rejects_wrong_checksum() {
        assert!(!pesel("44051401350"));
    }

    #[test]
    fn rejects_wrong_length() {
        assert!(!pesel("4405140135"));
        assert!(!pesel("440514013599"));
        assert!(!pesel(""));
    }

    #[test]
    fn rejects_non_digit() {
        assert!(!pesel("4405140135A"));
    }
}

// diff --git a/crates/nvisy-pattern/src/validators/pl/regon.rs b/crates/nvisy-pattern/src/validators/pl/regon.rs
// new file mode 100644
// index 00000000..42bb2346
// --- /dev/null
// +++ b/crates/nvisy-pattern/src/validators/pl/regon.rs
// @@ -0,0 +1,80 @@

// Polish REGON (Rejestr Gospodarki Narodowej) validator.
//
// 9 digits (entity-level) or 14 digits (unit-level). Both use
// a weighted-sum mod-11 check, with the 9-digit weights
// `[8, 9, 2, 3, 4, 5, 6, 7]` and the 14-digit weights
// `[2, 4, 8, 5, 0, 9, 7, 3, 6, 1, 2, 4, 8]`. A computed value
// of 10 is folded to the check digit 0 (the validator compares
// against `sum mod 11 mod 10`); it does not reject the number.

/// Weights for the first eight digits of a 9-digit REGON.
const WEIGHTS_9: [u32; 8] = [8, 9, 2, 3, 4, 5, 6, 7];
/// Weights for the first thirteen digits of a 14-digit REGON.
const WEIGHTS_14: [u32; 13] = [2, 4, 8, 5, 0, 9, 7, 3, 6, 1, 2, 4, 8];

// +/// Return `true` when `value` is a valid REGON in either the
// +/// 9-digit or 14-digit form.
Hyphen and space separators are +/// stripped before validation. +pub fn regon(value: &str) -> bool { + let chars: Vec = value + .chars() + .filter(|c| !c.is_ascii_whitespace() && *c != '-') + .collect(); + if !chars.iter().all(|c| c.is_ascii_digit()) { + return false; + } + match chars.len() { + 9 => valid_with(&chars, &WEIGHTS_9), + 14 => valid_with(&chars[..9], &WEIGHTS_9) && valid_with(&chars, &WEIGHTS_14), + _ => false, + } +} + +fn valid_with(chars: &[char], weights: &[u32]) -> bool { + let body = &chars[..weights.len()]; + let check = chars[weights.len()]; + let sum: u32 = body + .iter() + .zip(weights) + .map(|(c, w)| c.to_digit(10).unwrap() * w) + .sum(); + let computed = sum % 11 % 10; + computed == check.to_digit(10).unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_9() { + // 123456785 — widely-quoted valid REGON. + assert!(regon("123456785")); + } + + #[test] + fn accepts_canonical_14() { + // 12345678512347 — extends the 9-digit base with a valid + // 14-digit suffix check. + assert!(regon("12345678512347")); + } + + #[test] + fn accepts_with_separators() { + assert!(regon("123-456-785")); + } + + #[test] + fn rejects_wrong_checksum_9() { + assert!(!regon("123456789")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!regon("12345678")); + assert!(!regon("1234567890")); + assert!(!regon("")); + } + + #[test] + fn rejects_non_digit() { + assert!(!regon("12345678A")); + } +} diff --git a/crates/nvisy-pattern/src/validators/se/luhn.rs b/crates/nvisy-pattern/src/validators/se/luhn.rs new file mode 100644 index 00000000..19d52449 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/se/luhn.rs @@ -0,0 +1,18 @@ +//! Shared Luhn checksum for Swedish 10-digit identifiers. + +/// Return `true` when `digits` (exactly 10 decimal digits) pass +/// the Luhn checksum used by both personnummer and +/// organisationsnummer. 
+pub(super) fn luhn10(digits: &[u32]) -> bool { + let mut sum = 0u32; + let check = digits[9]; + for (i, d) in digits[..9].iter().rev().enumerate() { + if i.is_multiple_of(2) { + let m = d * 2; + sum += if m > 9 { m - 9 } else { m }; + } else { + sum += d; + } + } + (sum + check).is_multiple_of(10) +} diff --git a/crates/nvisy-pattern/src/validators/se/mod.rs b/crates/nvisy-pattern/src/validators/se/mod.rs new file mode 100644 index 00000000..67cb9f77 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/se/mod.rs @@ -0,0 +1,14 @@ +//! Sweden-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"se.personnummer"`, +//! `"se.organisationsnummer"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod luhn; +mod organisationsnummer; +mod personnummer; + +pub use self::organisationsnummer::organisationsnummer; +pub use self::personnummer::personnummer; diff --git a/crates/nvisy-pattern/src/validators/se/organisationsnummer.rs b/crates/nvisy-pattern/src/validators/se/organisationsnummer.rs new file mode 100644 index 00000000..1315525f --- /dev/null +++ b/crates/nvisy-pattern/src/validators/se/organisationsnummer.rs @@ -0,0 +1,62 @@ +//! Swedish Organisationsnummer validator. +//! +//! 10 digits with the third digit ≥ 2 (Bolagsverket's rule that +//! distinguishes organisationsnummer from personnummer) plus a +//! Luhn checksum over all 10. + +use super::luhn::luhn10; + +/// Return `true` when `value` is a valid organisationsnummer. +/// Hyphen separator is accepted; whitespace is ignored. 
+pub fn organisationsnummer(value: &str) -> bool { + let digits: Vec = value + .chars() + .filter(|c| c.is_ascii_digit()) + .map(|c| c.to_digit(10).unwrap()) + .collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-') + .count(); + if extras > 0 || digits.len() != 10 { + return false; + } + if digits[2] < 2 { + return false; + } + luhn10(&digits) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_orgnr() { + // 556677-1233 — public Bolagsverket example shape. + assert!(organisationsnummer("5566771233")); + assert!(organisationsnummer("556677-1233")); + } + + #[test] + fn accepts_second_vector() { + assert!(organisationsnummer("5522200004")); + } + + #[test] + fn rejects_low_third_digit() { + // Third digit 1 → looks like a personnummer, not an org. + assert!(!organisationsnummer("5516771233")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!organisationsnummer("5566771230")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!organisationsnummer("556677123")); + assert!(!organisationsnummer("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/se/personnummer.rs b/crates/nvisy-pattern/src/validators/se/personnummer.rs new file mode 100644 index 00000000..4f0320f3 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/se/personnummer.rs @@ -0,0 +1,94 @@ +//! Swedish Personnummer validator. +//! +//! Encoded as `YYMMDD-XXXX` (6-digit form) or `YYYYMMDD-XXXX` +//! (8-digit form, post-2000 cohort). Samordningsnummer adds 60 +//! to the day field. The trailing 10 digits carry a Luhn +//! checksum. + +use super::luhn::luhn10; + +/// Return `true` when `value` is a valid personnummer. Hyphen +/// `-` and plus `+` separators (the latter marks a 100-year-old +/// cohort) are accepted; whitespace is ignored. 
+pub fn personnummer(value: &str) -> bool { + let digits: Vec = value + .chars() + .filter(|c| c.is_ascii_digit()) + .map(|c| c.to_digit(10).unwrap()) + .collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace() && *c != '-' && *c != '+') + .count(); + if extras > 0 || !matches!(digits.len(), 10 | 12) { + return false; + } + let pnr10: Vec = digits.into_iter().rev().take(10).rev().collect(); + + let month = pnr10[2] * 10 + pnr10[3]; + let raw_day = pnr10[4] * 10 + pnr10[5]; + let day = if raw_day >= 61 { raw_day - 60 } else { raw_day }; + if !(1..=12).contains(&month) || !(1..=31).contains(&day) { + return false; + } + + luhn10(&pnr10) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_personnummer() { + // 900101-1239 — 6-digit date form with valid Luhn. + assert!(personnummer("9001011239")); + assert!(personnummer("900101-1239")); + } + + #[test] + fn accepts_8_digit_form() { + // 19900101-1239 — 12-digit form; Luhn applies to last 10. + assert!(personnummer("199001011239")); + assert!(personnummer("19900101-1239")); + } + + #[test] + fn accepts_plus_separator() { + // `+` marks a 100-year-old cohort; structurally identical. + assert!(personnummer("900101+1239")); + } + + #[test] + fn accepts_samordningsnummer() { + // Day 61 = samordningsnummer for day 1. + // 900161 + 1234 → compute matching Luhn. + let body = "900161123"; + for last in 0..10 { + let s = format!("{body}{last}"); + let digits: Vec = s.chars().map(|c| c.to_digit(10).unwrap()).collect(); + if luhn10(&digits) { + assert!(personnummer(&s)); + return; + } + } + panic!("no valid samordningsnummer found for body {body}"); + } + + #[test] + fn rejects_invalid_date() { + // Month 13. 
+ assert!(!personnummer("9013011230")); + } + + #[test] + fn rejects_wrong_checksum() { + assert!(!personnummer("9001011230")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!personnummer("900101123")); + assert!(!personnummer("")); + } +} diff --git a/crates/nvisy-pattern/src/validators/sg/mod.rs b/crates/nvisy-pattern/src/validators/sg/mod.rs new file mode 100644 index 00000000..40c72607 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/sg/mod.rs @@ -0,0 +1,12 @@ +//! Singapore-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"sg.nric"`, `"sg.uen"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod nric; +mod uen; + +pub use self::nric::nric; +pub use self::uen::uen; diff --git a/crates/nvisy-pattern/src/validators/sg/nric.rs b/crates/nvisy-pattern/src/validators/sg/nric.rs new file mode 100644 index 00000000..989f878f --- /dev/null +++ b/crates/nvisy-pattern/src/validators/sg/nric.rs @@ -0,0 +1,115 @@ +//! Singapore NRIC / FIN checksum validator. +//! +//! 9 characters: prefix letter + 7 digits + check letter. +//! +//! Prefix: +//! - `S` — Singapore citizen / PR born before 2000 +//! - `T` — Singapore citizen / PR born 2000+ +//! - `F` — foreigner (Long-Term Pass) issued before 2000 +//! - `G` — foreigner issued 2000+ +//! - `M` — foreigner issued 2022+ (introduced after the original +//! NRIC spec; check digit uses a different table and offset). +//! +//! Algorithm: weighted sum of the 7 digits with +//! `[2, 7, 6, 5, 4, 3, 2]`, plus an offset of 4 for `T`/`G` +//! and 3 for `M`. The remainder mod 11 indexes into a +//! prefix-specific letter alphabet; for `M`, the table is read +//! at position `10 - r`. 
/// Check letters for `S`/`T` prefixes, indexed by `sum mod 11`.
const ST_TABLE: &[u8; 11] = b"JZIHGFEDCBA";
/// Check letters for `F`/`G` prefixes, indexed by `sum mod 11`.
const FG_TABLE: &[u8; 11] = b"XWUTRQPNMLK";
/// Check letters for the `M` prefix, indexed by `10 - sum mod 11`.
const M_TABLE: &[u8; 11] = b"KLJNPQRTUWX";
/// Per-position weights for the seven body digits.
const WEIGHTS: [u32; 7] = [2, 7, 6, 5, 4, 3, 2];

/// Return `true` when `value` is a valid 9-character NRIC/FIN.
///
/// Surrounding whitespace is trimmed and ASCII letters are
/// upper-cased before checking. The value must be a prefix
/// letter (`S`, `T`, `F`, `G` or `M`), seven ASCII digits and
/// an ASCII check letter. The check letter must match the table
/// entry selected by the weighted digit sum, plus 4 for `T`/`G`
/// or 3 for `M`, modulo 11.
pub fn nric(value: &str) -> bool {
    let chars: Vec<char> = value
        .trim()
        .chars()
        .map(|c| c.to_ascii_uppercase())
        .collect();
    if chars.len() != 9 {
        return false;
    }
    let (prefix, body, check) = (chars[0], &chars[1..8], chars[8]);
    if !check.is_ascii_uppercase() {
        return false;
    }

    // Offset added to the sum, the letter table, and whether the table
    // is read from the far end (`10 - r`) for this prefix.
    let (offset, table, mirrored) = match prefix {
        'S' => (0, ST_TABLE, false),
        'T' => (4, ST_TABLE, false),
        'F' => (0, FG_TABLE, false),
        'G' => (4, FG_TABLE, false),
        'M' => (3, M_TABLE, true),
        _ => return false,
    };

    let mut sum: u32 = offset;
    for (c, w) in body.iter().zip(WEIGHTS) {
        // `to_digit(10)` is `None` for anything but ASCII `0-9`.
        match c.to_digit(10) {
            Some(d) => sum += d * w,
            None => return false,
        }
    }

    let r = (sum % 11) as usize;
    let idx = if mirrored { 10 - r } else { r };
    table[idx] as char == check
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_s_prefix() {
        assert!(nric("S2740116C"));
    }

    #[test]
    fn accepts_t_prefix() {
        assert!(nric("T1234567J"));
    }

    #[test]
    fn accepts_f_prefix() {
        assert!(nric("F1234567N"));
    }

    #[test]
    fn accepts_g_prefix() {
        assert!(nric("G1234567X"));
    }

    #[test]
    fn accepts_m_prefix() {
        assert!(nric("M1234567K"));
    }

    #[test]
    fn accepts_lowercase_input() {
        assert!(nric("s2740116c"));
    }

    #[test]
    fn rejects_unknown_prefix() {
        assert!(!nric("A2740116C"));
    }

    #[test]
    fn rejects_wrong_check_letter() {
        assert!(!nric("S2740116D"));
    }

    #[test]
    fn rejects_wrong_length() {
        assert!(!nric("S274011C"));
        assert!(!nric("S27401166C"));
        assert!(!nric(""));
    }

    #[test]
    fn rejects_non_digit_body() {
        assert!(!nric("S274A116C"));
    }
}

// ---- patch text for the next file (first half of its header line), kept verbatim ----
// diff --git a/crates/nvisy-pattern/src/validators/sg/uen.rs
// ---- patch header for sg/uen.rs (second half of its `diff --git` line onward), kept verbatim ----
// b/crates/nvisy-pattern/src/validators/sg/uen.rs
// new file mode 100644
// index 00000000..de743135
// --- /dev/null
// +++ b/crates/nvisy-pattern/src/validators/sg/uen.rs
// @@ -0,0 +1,140 @@

// Singapore Unique Entity Number (UEN) validator.
//
// Three formats issued by ACRA:
//
// - Format A: 9 chars — 8 digits + check letter.
// - Format B: 10 chars — 4-digit year of registration + 4
//   digits + check letter. The year cannot be in the future
//   (NOTE(review): this validator does not check the year).
// - Format C: 10 chars — `T`/`S`/`R` prefix + 2 digits + 2-letter
//   entity-type code (from a fixed list) + 4 digits + check
//   letter.
//
// Each format uses its own weight vector and check-letter
// alphabet. Format C's modulo arithmetic subtracts 5 before
// taking `mod 11`.

const FORMAT_A_WEIGHTS: [u32; 8] = [10, 4, 9, 3, 8, 2, 7, 1];
const FORMAT_A_ALPHABET: &[u8; 11] = b"XMKECAWLJDB";
const FORMAT_B_WEIGHTS: [u32; 9] = [10, 8, 6, 4, 9, 7, 5, 3, 1];
const FORMAT_B_ALPHABET: &[u8; 11] = b"ZKCMDNERGWH";
const FORMAT_C_WEIGHTS: [i64; 9] = [4, 3, 5, 3, 10, 2, 2, 5, 7];
/// Format C maps each character to its position in this string,
/// and the check character is read back from the same string.
const FORMAT_C_ALPHABET: &str = "ABCDEFGHJKLMNPQRSTUVWX0123456789";

const FORMAT_C_PREFIXES: &str = "TSR";
const FORMAT_C_ENTITY_TYPES: &[&str] = &[
    "LP", "LL", "FC", "PF", "RF", "MQ", "MM", "NB", "CC", "CS", "MB", "FM", "GS", "DP", "CP", "NR",
    "CM", "CD", "MD", "HS", "VH", "CH", "MH", "CL", "XL", "CX", "HC", "RP", "TU", "TC", "FB", "FN",
    "PA", "PB", "SS", "MC", "SM", "GA", "GB",
];

/// Return `true` when `value` is a valid UEN in any of the three
/// ACRA-published formats.
///
/// Surrounding whitespace is trimmed and ASCII letters are
/// upper-cased first. Nine characters are checked as format A.
/// Ten characters are checked as format C when the first is an
/// ASCII letter, otherwise as format B. Every other length is
/// rejected. Never panics, whatever characters `value` contains.
pub fn uen(value: &str) -> bool {
    let chars: Vec<char> = value
        .trim()
        .chars()
        .map(|c| c.to_ascii_uppercase())
        .collect();
    match chars.len() {
        9 => validate_a(&chars),
        10 if chars[0].is_ascii_alphabetic() => validate_c(&chars),
        10 => validate_b(&chars),
        _ => false,
    }
}

/// Weighted sum of decimal digits; `None` if any character is
/// not an ASCII digit.
fn weighted_digit_sum(digits: &[char], weights: &[u32]) -> Option<u32> {
    digits
        .iter()
        .zip(weights)
        .map(|(c, w)| c.to_digit(10).map(|d| d * w))
        .sum()
}

/// Format A: 8 digits + check letter (`chars.len() == 9`).
fn validate_a(chars: &[char]) -> bool {
    let (body, check) = (&chars[..8], chars[8]);
    if !check.is_ascii_uppercase() {
        return false;
    }
    match weighted_digit_sum(body, &FORMAT_A_WEIGHTS) {
        Some(sum) => FORMAT_A_ALPHABET[(sum % 11) as usize] as char == check,
        None => false,
    }
}

/// Format B: 9 digits + check letter (`chars.len() == 10`).
fn validate_b(chars: &[char]) -> bool {
    let (body, check) = (&chars[..9], chars[9]);
    if !check.is_ascii_uppercase() {
        return false;
    }
    match weighted_digit_sum(body, &FORMAT_B_WEIGHTS) {
        Some(sum) => FORMAT_B_ALPHABET[(sum % 11) as usize] as char == check,
        None => false,
    }
}

/// Format C: prefix + 2 digits + entity type + 4 digits + check
/// letter (`chars.len() == 10`).
fn validate_c(chars: &[char]) -> bool {
    if !FORMAT_C_PREFIXES.contains(chars[0]) {
        return false;
    }
    if !chars[1..3].iter().all(|c| c.is_ascii_digit()) {
        return false;
    }
    // Build the entity type from the `char` slice. Byte-slicing the
    // string (`&s[3..5]`) would panic when a multi-byte character
    // straddles byte 5, e.g. in "T05Lé12345".
    let entity_type: String = chars[3..5].iter().collect();
    if !FORMAT_C_ENTITY_TYPES.contains(&entity_type.as_str()) {
        return false;
    }
    if !chars[5..9].iter().all(|c| c.is_ascii_digit()) || !chars[9].is_ascii_uppercase() {
        return false;
    }
    // Every body character was validated above, so the lookup should
    // always succeed; `None` is still treated as "invalid" rather
    // than unwrapped.
    let sum: Option<i64> = chars[..9]
        .iter()
        .zip(FORMAT_C_WEIGHTS)
        .map(|(c, w)| FORMAT_C_ALPHABET.find(*c).map(|pos| pos as i64 * w))
        .sum();
    let sum = match sum {
        Some(sum) => sum,
        None => return false,
    };
    let idx = (sum - 5).rem_euclid(11) as usize;
    FORMAT_C_ALPHABET.as_bytes()[idx] as char == chars[9]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_format_a() {
        // Body 12345678 → check `M`.
        assert!(uen("12345678M"));
    }

    #[test]
    fn accepts_format_b() {
        // Body 200512345 (year 2005) → check `R`.
        assert!(uen("200512345R"));
    }

    #[test]
    fn accepts_format_c() {
        // T05LL1234 (limited liability partnership) → check `D`.
        assert!(uen("T05LL1234D"));
    }

    #[test]
    fn rejects_format_c_unknown_entity_type() {
        // `ZZ` is not a valid entity type.
        assert!(!uen("T05ZZ1234D"));
    }

    #[test]
    fn rejects_format_a_wrong_checksum() {
        assert!(!uen("12345678A"));
    }

    #[test]
    fn rejects_format_b_wrong_checksum() {
        assert!(!uen("200512345A"));
    }

    #[test]
    fn rejects_wrong_length() {
        assert!(!uen("1234567M"));
        assert!(!uen("12345678MA"));
        assert!(!uen(""));
    }

    #[test]
    fn rejects_format_c_wrong_prefix() {
        // `X` is not a valid format-C prefix.
        assert!(!uen("X05LL1234D"));
    }
}

// ---- patch text for the next files, kept verbatim ----
// diff --git a/crates/nvisy-pattern/src/validators/th/mod.rs b/crates/nvisy-pattern/src/validators/th/mod.rs
// new file mode 100644
// index 00000000..d5344175
// --- /dev/null
// +++ b/crates/nvisy-pattern/src/validators/th/mod.rs
// @@ -0,0 +1,10 @@
// +//! Thailand-specific post-match validators.
// +//!
// +//! Registered under the [`ValidatorRegistry::builtin`] set with
// +//! dotted names — `"th.national_id"`.
// +//!
// +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin
// +
// +mod national_id;
// +
// +pub use self::national_id::national_id;
// diff --git a/crates/nvisy-pattern/src/validators/th/national_id.rs b/crates/nvisy-pattern/src/validators/th/national_id.rs
// new file mode 100644
// index 00000000..8235c2f5
// --- /dev/null
// +++ b/crates/nvisy-pattern/src/validators/th/national_id.rs
// @@ -0,0 +1,85 @@
// +//! Thai National Identification Number (เลขประจำตัวประชาชน)
// +//! validator.
// +//!
// +//! 13 digits issued by the Department of Provincial
// +//! Administration (กรมการปกครอง). The first digit encodes the
// +//! citizenship category (1-8); 0 is reserved. The 13th digit is
// +//! a weighted checksum with weights `[13, 12, 11, 10, 9, 8, 7,
// +//! 6, 5, 4, 3, 2]` over the first 12 digits; check digit equals
// +//! `(11 - sum mod 11) mod 10`.
/// Per-position weights applied to the first 12 digits when
/// computing the check digit.
const WEIGHTS: [u32; 12] = [13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2];

/// Return `true` when `value` is a valid 13-digit Thai NID.
/// Whitespace and `-` separators in the rendering `X-XXXX-XXXXX-XX-X`
/// are stripped before validation.
///
/// Any other non-digit character rejects the value. The first
/// digit must be in `1..=8`, and the thirteenth must equal
/// `(11 - sum mod 11) mod 10`, where `sum` is the
/// [`WEIGHTS`]-weighted sum of the first twelve digits.
pub fn national_id(value: &str) -> bool {
    // `char::to_digit(10)` is `Some` only for ASCII `0-9`.
    let digits: Vec<u32> = value.chars().filter_map(|c| c.to_digit(10)).collect();
    let has_foreign_chars = value
        .chars()
        .any(|c| !(c.is_ascii_digit() || c.is_ascii_whitespace() || c == '-'));
    if has_foreign_chars || digits.len() != 13 {
        return false;
    }
    if !(1..=8).contains(&digits[0]) {
        return false;
    }
    let sum: u32 = digits[..12].iter().zip(WEIGHTS).map(|(d, w)| d * w).sum();
    (11 - sum % 11) % 10 == digits[12]
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_canonical_nid() {
        // Body 123456789012 → check 1.
        assert!(national_id("1234567890121"));
    }

    #[test]
    fn accepts_second_vector() {
        assert!(national_id("1000000000009"));
    }

    #[test]
    fn accepts_with_separators() {
        assert!(national_id("1-2345-67890-12-1"));
        assert!(national_id("1 2345 67890 12 1"));
    }

    #[test]
    fn rejects_reserved_leading_zero() {
        assert!(!national_id("0234567890121"));
    }

    #[test]
    fn rejects_leading_nine() {
        // Category 9 is not assigned.
        assert!(!national_id("9234567890121"));
    }

    #[test]
    fn rejects_wrong_checksum() {
        assert!(!national_id("1234567890120"));
    }

    #[test]
    fn rejects_wrong_length() {
        assert!(!national_id("123456789012"));
        assert!(!national_id("12345678901211"));
        assert!(!national_id(""));
    }

    #[test]
    fn rejects_non_digit() {
        assert!(!national_id("123456789012A"));
    }
}

// ---- patch text for the next file, kept verbatim ----
// diff --git a/crates/nvisy-pattern/src/validators/tr/mod.rs b/crates/nvisy-pattern/src/validators/tr/mod.rs
// new file mode 100644
// index 00000000..fdd2e64c
// --- /dev/null
// +++ b/crates/nvisy-pattern/src/validators/tr/mod.rs
// @@ -0,0 +1,10 @@
// +//!
Turkey-specific post-match validators. +//! +//! Registered under the [`ValidatorRegistry::builtin`] set with +//! dotted names — `"tr.tckn"`. +//! +//! [`ValidatorRegistry::builtin`]: super::ValidatorRegistry::builtin + +mod tckn; + +pub use self::tckn::tckn; diff --git a/crates/nvisy-pattern/src/validators/tr/tckn.rs b/crates/nvisy-pattern/src/validators/tr/tckn.rs new file mode 100644 index 00000000..4b2791a3 --- /dev/null +++ b/crates/nvisy-pattern/src/validators/tr/tckn.rs @@ -0,0 +1,86 @@ +//! Turkish T.C. Kimlik No (TCKN) validator. +//! +//! 11-digit national identifier issued by Nüfus ve Vatandaşlık +//! İşleri (NVI). The first digit cannot be zero. Two check +//! digits: +//! +//! - 10th = `(sum_odd_positions * 7 - sum_even_positions) mod 10` +//! (over the first 9 digits, 1-indexed odd/even). +//! - 11th = `sum_of_first_10 mod 10`. + +/// Return `true` when `value` is a valid 11-digit TCKN. +pub fn tckn(value: &str) -> bool { + let digits: Vec<u32> = value + .chars() + .filter(|c| c.is_ascii_digit()) + .map(|c| c.to_digit(10).unwrap()) + .collect(); + let extras = value + .chars() + .filter(|c| !c.is_ascii_digit() && !c.is_ascii_whitespace()) + .count(); + if digits.len() != 11 || extras > 0 { + return false; + } + if digits[0] == 0 { + return false; + } + + let odd_sum: i64 = (digits[0] + digits[2] + digits[4] + digits[6] + digits[8]) as i64; + let even_sum: i64 = (digits[1] + digits[3] + digits[5] + digits[7]) as i64; + let tenth = (odd_sum * 7 - even_sum).rem_euclid(10) as u32; + if tenth != digits[9] { + return false; + } + + let eleventh: u32 = digits[..10].iter().sum::<u32>() % 10; + eleventh == digits[10] +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn accepts_canonical_tckn() { + // Body 123456789 → 10th = 5, 11th = 0.
+ assert!(tckn("12345678950")); + } + + #[test] + fn accepts_second_vector() { + assert!(tckn("10000000078")); + } + + #[test] + fn accepts_third_vector() { + assert!(tckn("98765432150")); + } + + #[test] + fn rejects_leading_zero() { + assert!(!tckn("02345678950")); + } + + #[test] + fn rejects_wrong_tenth_digit() { + assert!(!tckn("12345678900")); + } + + #[test] + fn rejects_wrong_eleventh_digit() { + assert!(!tckn("12345678955")); + } + + #[test] + fn rejects_wrong_length() { + assert!(!tckn("1234567895")); + assert!(!tckn("123456789500")); + assert!(!tckn("")); + } + + #[test] + fn rejects_non_digit() { + assert!(!tckn("1234567895A")); + } +} diff --git a/crates/nvisy-pattern/src/validators/verhoeff.rs b/crates/nvisy-pattern/src/validators/verhoeff.rs new file mode 100644 index 00000000..b2de8dec --- /dev/null +++ b/crates/nvisy-pattern/src/validators/verhoeff.rs @@ -0,0 +1,40 @@ +//! Verhoeff checksum tables and helper. +//! +//! Verhoeff is a dihedral-group-based checksum that detects all +//! single-digit and most transposition errors. Used by Indian +//! Aadhaar and Nigerian NIN. + +const D: [[u32; 10]; 10] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 0, 6, 7, 8, 9, 5], + [2, 3, 4, 0, 1, 7, 8, 9, 5, 6], + [3, 4, 0, 1, 2, 8, 9, 5, 6, 7], + [4, 0, 1, 2, 3, 9, 5, 6, 7, 8], + [5, 9, 8, 7, 6, 0, 4, 3, 2, 1], + [6, 5, 9, 8, 7, 1, 0, 4, 3, 2], + [7, 6, 5, 9, 8, 2, 1, 0, 4, 3], + [8, 7, 6, 5, 9, 3, 2, 1, 0, 4], + [9, 8, 7, 6, 5, 4, 3, 2, 1, 0], +]; + +const P: [[u32; 10]; 8] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 5, 7, 6, 2, 8, 3, 0, 9, 4], + [5, 8, 0, 3, 7, 9, 6, 1, 4, 2], + [8, 9, 1, 6, 0, 4, 3, 5, 2, 7], + [9, 4, 5, 3, 1, 2, 6, 8, 7, 0], + [4, 2, 8, 6, 5, 7, 3, 9, 0, 1], + [2, 7, 9, 3, 8, 0, 6, 4, 1, 5], + [7, 0, 4, 6, 9, 1, 3, 2, 5, 8], +]; + +/// Return `true` when `digits` (already validated as ASCII +/// digits) passes the Verhoeff checksum. 
+pub(super) fn verhoeff(digits: &str) -> bool { + let mut c = 0u32; + for (i, ch) in digits.chars().rev().enumerate() { + let d = ch.to_digit(10).unwrap() as usize; + c = D[c as usize][P[i % 8][d] as usize]; + } + c == 0 +} diff --git a/crates/nvisy-pattern/testdata/inputs/au/finance.txt b/crates/nvisy-pattern/testdata/inputs/au/finance.txt new file mode 100644 index 00000000..54831542 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/au/finance.txt @@ -0,0 +1,8 @@ +Company filing for the Australian Business Register: + + Company name: Example Pty Ltd + Australian Business Number: 51 824 753 556 + Australian Company Number: 123 456 780 + Registered office: Sydney NSW + +The ABN above must appear on all tax invoices issued. diff --git a/crates/nvisy-pattern/testdata/inputs/au/health.txt b/crates/nvisy-pattern/testdata/inputs/au/health.txt new file mode 100644 index 00000000..9d05266e --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/au/health.txt @@ -0,0 +1,4 @@ +Patient handover for Mr. Doe — refer to the Medicare card on +file with Services Australia. + +Medicare number: 2228 12366 1 diff --git a/crates/nvisy-pattern/testdata/inputs/au/identity.txt b/crates/nvisy-pattern/testdata/inputs/au/identity.txt new file mode 100644 index 00000000..a5a907a3 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/au/identity.txt @@ -0,0 +1,4 @@ +Client record for John Smith — see the Australian Tax File +Number registered with the ATO. + +Tax File Number (TFN): 123 456 782 diff --git a/crates/nvisy-pattern/testdata/inputs/ca/contact.txt b/crates/nvisy-pattern/testdata/inputs/ca/contact.txt new file mode 100644 index 00000000..6027def6 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/ca/contact.txt @@ -0,0 +1,7 @@ +Mailing address for Canada Post delivery: + + 150 Elgin Street + Ottawa, ON K1P 1A1 + Canada + +Use the postal code above on the shipping label. 
diff --git a/crates/nvisy-pattern/testdata/inputs/ca/identity.txt b/crates/nvisy-pattern/testdata/inputs/ca/identity.txt new file mode 100644 index 00000000..b016ddae --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/ca/identity.txt @@ -0,0 +1,4 @@ +Client record for John Doe — see Social Insurance Number on +file with Service Canada. + +SIN: 123 456 782 diff --git a/crates/nvisy-pattern/testdata/inputs/de/contact.txt b/crates/nvisy-pattern/testdata/inputs/de/contact.txt new file mode 100644 index 00000000..1b72288f --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/de/contact.txt @@ -0,0 +1,7 @@ +Postanschrift für den Versand der Rechnung: + + Friedrichstraße 100 + 10117 Berlin + Deutschland + +Die Postleitzahl oben für das Versandlabel verwenden. diff --git a/crates/nvisy-pattern/testdata/inputs/de/finance.txt b/crates/nvisy-pattern/testdata/inputs/de/finance.txt new file mode 100644 index 00000000..7b595c63 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/de/finance.txt @@ -0,0 +1,8 @@ +Unternehmensdaten für die Gesellschaftsstammakte: + + Firma: Beispiel GmbH + Handelsregister: HRB 123456 + Registergericht: Amtsgericht Berlin-Charlottenburg + Sitz: Berlin + +Die Handelsregisternummer ist im Impressum aufzuführen. diff --git a/crates/nvisy-pattern/testdata/inputs/de/health.txt b/crates/nvisy-pattern/testdata/inputs/de/health.txt new file mode 100644 index 00000000..d047e7f4 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/de/health.txt @@ -0,0 +1,6 @@ +Patientenakte aus der Praxis Dr. Schmidt — siehe BSNR, LANR +und Krankenversichertennummer von der gesetzlichen Krankenkasse. 
+ +Betriebsstättennummer (BSNR): 381789045 +Arztnummer (LANR): 123456601 +Krankenversichertennummer: A000500015 diff --git a/crates/nvisy-pattern/testdata/inputs/de/identity.txt b/crates/nvisy-pattern/testdata/inputs/de/identity.txt new file mode 100644 index 00000000..59e80639 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/de/identity.txt @@ -0,0 +1,9 @@ +Personenakte für Herrn Max Mustermann — siehe Personalausweis, +Reisepass, Steueridentifikationsnummer und USt-IdNr. + +Personalausweis (nPA): L01X00T44 +Reisepassnummer: C0J9H58P3 +Steueridentifikationsnummer: 65929970489 +Umsatzsteuer-IdNr: DE129273398 +Führerscheinnummer: B072RRE2I52 +Rentenversicherungsnummer: 15010685M016 diff --git a/crates/nvisy-pattern/testdata/inputs/de/vehicle.txt b/crates/nvisy-pattern/testdata/inputs/de/vehicle.txt new file mode 100644 index 00000000..0d7e819e --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/de/vehicle.txt @@ -0,0 +1,8 @@ +Fahrzeug im Bestand: + + Marke / Modell: BMW 3er + Amtliches Kennzeichen: B-AB 1234 + Fahrzeugschein: vorhanden + HU gültig: ja + +Die Versicherung ist auf das oben eingetragene Kennzeichen zu buchen. diff --git a/crates/nvisy-pattern/testdata/inputs/es/contact.txt b/crates/nvisy-pattern/testdata/inputs/es/contact.txt new file mode 100644 index 00000000..1230e906 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/es/contact.txt @@ -0,0 +1,7 @@ +Dirección postal para el envío de la factura: + + Calle Gran Vía 28 + 28013 Madrid + España + +Utilizar el código postal anterior para el etiquetado. 
diff --git a/crates/nvisy-pattern/testdata/inputs/es/finance.txt b/crates/nvisy-pattern/testdata/inputs/es/finance.txt new file mode 100644 index 00000000..16da5b80 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/es/finance.txt @@ -0,0 +1,7 @@ +Datos fiscales de la sociedad: + + Razón social: Ejemplo SL + Código de identificación fiscal (CIF): A12345674 + Domicilio fiscal: Madrid + +Incluir el CIF anterior en cualquier factura emitida. diff --git a/crates/nvisy-pattern/testdata/inputs/es/identity.txt b/crates/nvisy-pattern/testdata/inputs/es/identity.txt new file mode 100644 index 00000000..6b4add85 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/es/identity.txt @@ -0,0 +1,7 @@ +Expediente del cliente Juan García López — referenciar el DNI, el +número de identidad de extranjero (NIE) del conviviente, y el +pasaporte adjunto. + +DNI: 12345678Z +NIE (conviviente): X1234567L +Pasaporte: AAA123456 diff --git a/crates/nvisy-pattern/testdata/inputs/fi/identity.txt b/crates/nvisy-pattern/testdata/inputs/fi/identity.txt new file mode 100644 index 00000000..0d8873c2 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/fi/identity.txt @@ -0,0 +1,4 @@ +Asiakkaan tiedot — Matti Meikäläinen — katso +henkilötunnus alta. + +Henkilötunnus: 010170-123F diff --git a/crates/nvisy-pattern/testdata/inputs/in/finance.txt b/crates/nvisy-pattern/testdata/inputs/in/finance.txt new file mode 100644 index 00000000..35b2be53 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/in/finance.txt @@ -0,0 +1,8 @@ +GST registration for the entity: + + Legal name: Example Private Limited + GSTIN: 27AAAPL1234C1ZE + State: Maharashtra + PAN: AAAPL1234C + +The GSTIN above must be quoted on every tax invoice raised. 
diff --git a/crates/nvisy-pattern/testdata/inputs/in/identity.txt b/crates/nvisy-pattern/testdata/inputs/in/identity.txt new file mode 100644 index 00000000..e85df843 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/in/identity.txt @@ -0,0 +1,8 @@ +Client record for Rohit Sharma — see the Aadhaar number issued +by UIDAI, the PAN card on file with the Income Tax Department, +and the Indian passport. + +Aadhaar: 2341 2341 2346 +PAN: ABCPK1234E +Passport: M1234567 +Voter ID: ABC1234567 diff --git a/crates/nvisy-pattern/testdata/inputs/in/vehicle.txt b/crates/nvisy-pattern/testdata/inputs/in/vehicle.txt new file mode 100644 index 00000000..bcf9bc79 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/in/vehicle.txt @@ -0,0 +1,7 @@ +Vehicle on file with the Regional Transport Office: + + Make / model: Tata Nexon + Vehicle registration number: MH12AB1234 + RTO: Pune + +Insurance to be billed against the registration plate above. diff --git a/crates/nvisy-pattern/testdata/inputs/it/finance.txt b/crates/nvisy-pattern/testdata/inputs/it/finance.txt new file mode 100644 index 00000000..70fe03e8 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/it/finance.txt @@ -0,0 +1,7 @@ +Dati fiscali della società: + + Ragione sociale: Esempio S.r.l. + Partita IVA: 00154980569 + Sede legale: Milano + +Indicare la P.IVA su tutte le fatture emesse. diff --git a/crates/nvisy-pattern/testdata/inputs/it/identity.txt b/crates/nvisy-pattern/testdata/inputs/it/identity.txt new file mode 100644 index 00000000..4aa5da78 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/it/identity.txt @@ -0,0 +1,7 @@ +Pratica del cliente Mario Rossi — vedere il codice fiscale, la +carta d'identità elettronica e il passaporto allegati. 
+ +Codice fiscale: RSSMRA70A01H501S +Carta d'identità: CA12345AB +Passaporto: YA1234567 +Patente di guida: MI1234567A diff --git a/crates/nvisy-pattern/testdata/inputs/kr/finance.txt b/crates/nvisy-pattern/testdata/inputs/kr/finance.txt new file mode 100644 index 00000000..214f9ac9 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/kr/finance.txt @@ -0,0 +1,7 @@ +사업자 정보: + + 법인명: 예시 주식회사 + 사업자등록번호: 123-45-67891 + 주소: 서울특별시 강남구 + +위 사업자번호는 모든 세금계산서에 표기해야 합니다. diff --git a/crates/nvisy-pattern/testdata/inputs/kr/identity.txt b/crates/nvisy-pattern/testdata/inputs/kr/identity.txt new file mode 100644 index 00000000..d6673332 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/kr/identity.txt @@ -0,0 +1,7 @@ +고객 기록 — 김민수 — 주민등록번호와 한국 여권을 +참조하십시오. + +주민등록번호: 800101-1112343 +외국인등록번호: 900101-5112344 +여권번호: M123N4567 +운전면허번호: 11-20-123456-78 diff --git a/crates/nvisy-pattern/testdata/inputs/ng/identity.txt b/crates/nvisy-pattern/testdata/inputs/ng/identity.txt new file mode 100644 index 00000000..81fae4fd --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/ng/identity.txt @@ -0,0 +1,4 @@ +Client record for Mr. Adekunle Okafor — see the National +Identification Number issued by NIMC. + +NIN: 12345678902 diff --git a/crates/nvisy-pattern/testdata/inputs/ng/vehicle.txt b/crates/nvisy-pattern/testdata/inputs/ng/vehicle.txt new file mode 100644 index 00000000..32f6d115 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/ng/vehicle.txt @@ -0,0 +1,7 @@ +Vehicle registered with the Federal Road Safety Corps (FRSC): + + Make / model: Toyota Camry + Plate number: ABC-123DE + LGA: Ikeja + +Insurance billed against the plate above. diff --git a/crates/nvisy-pattern/testdata/inputs/pl/contact.txt b/crates/nvisy-pattern/testdata/inputs/pl/contact.txt new file mode 100644 index 00000000..a32c0184 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/pl/contact.txt @@ -0,0 +1,7 @@ +Adres pocztowy do wysyłki faktury: + + ul. 
Marszałkowska 100 + 00-001 Warszawa + Polska + +Użyj powyższego kodu pocztowego na etykiecie wysyłkowej. diff --git a/crates/nvisy-pattern/testdata/inputs/pl/finance.txt b/crates/nvisy-pattern/testdata/inputs/pl/finance.txt new file mode 100644 index 00000000..3a208877 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/pl/finance.txt @@ -0,0 +1,8 @@ +Dane podatnika spółki: + + Nazwa firmy: Przykład Sp. z o.o. + NIP: 106-000-00-62 + REGON: 123456785 + Siedziba: Warszawa + +Numer NIP należy podawać na wszystkich fakturach. diff --git a/crates/nvisy-pattern/testdata/inputs/pl/identity.txt b/crates/nvisy-pattern/testdata/inputs/pl/identity.txt new file mode 100644 index 00000000..b6dce254 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/pl/identity.txt @@ -0,0 +1,5 @@ +Akta klienta Jan Kowalski — sprawdź numer PESEL i numer ewidencyjny +osoby na utrzymaniu. + +Numer PESEL: 44051401359 +Numer ewidencyjny: 78010112345 diff --git a/crates/nvisy-pattern/testdata/inputs/se/contact.txt b/crates/nvisy-pattern/testdata/inputs/se/contact.txt new file mode 100644 index 00000000..04298821 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/se/contact.txt @@ -0,0 +1,7 @@ +Postadress för faktura: + + Drottninggatan 100 + 111 60 Stockholm + Sverige + +Använd postnumret ovan på fraktetiketten. diff --git a/crates/nvisy-pattern/testdata/inputs/se/finance.txt b/crates/nvisy-pattern/testdata/inputs/se/finance.txt new file mode 100644 index 00000000..317e7bf5 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/se/finance.txt @@ -0,0 +1,7 @@ +Företagsuppgifter för bolagsregistret: + + Företagsnamn: Exempel AB + Organisationsnummer: 556677-1233 + Säte: Stockholm + +Organisationsnumret ska anges på alla fakturor. 
diff --git a/crates/nvisy-pattern/testdata/inputs/se/identity.txt b/crates/nvisy-pattern/testdata/inputs/se/identity.txt new file mode 100644 index 00000000..d4419940 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/se/identity.txt @@ -0,0 +1,4 @@ +Klientuppgift för Anders Andersson — se personnummer enligt +Skatteverkets registrering. + +Personnummer: 900101-1239 diff --git a/crates/nvisy-pattern/testdata/inputs/sg/contact.txt b/crates/nvisy-pattern/testdata/inputs/sg/contact.txt new file mode 100644 index 00000000..354ee260 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/sg/contact.txt @@ -0,0 +1,6 @@ +Mailing address for SingPost delivery: + + 1 Marina Bay Drive + Singapore 018989 + +Use the postal code above on the shipping label. diff --git a/crates/nvisy-pattern/testdata/inputs/sg/finance.txt b/crates/nvisy-pattern/testdata/inputs/sg/finance.txt new file mode 100644 index 00000000..b1c35662 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/sg/finance.txt @@ -0,0 +1,7 @@ +Company filing with ACRA: + + Entity name: Example Pte Ltd + Unique Entity Number (UEN): 200512345R + Business address: Singapore + +The UEN above must be quoted on tax invoices. diff --git a/crates/nvisy-pattern/testdata/inputs/sg/identity.txt b/crates/nvisy-pattern/testdata/inputs/sg/identity.txt new file mode 100644 index 00000000..0551fe5c --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/sg/identity.txt @@ -0,0 +1,4 @@ +Client record for Mr. Tan Wei Ming — see NRIC issued by ICA. 
+ +NRIC: S2740116C +FIN: F1234567N diff --git a/crates/nvisy-pattern/testdata/inputs/th/contact.txt b/crates/nvisy-pattern/testdata/inputs/th/contact.txt new file mode 100644 index 00000000..e3ce1f7c --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/th/contact.txt @@ -0,0 +1,7 @@ +ที่อยู่จัดส่งสำหรับไปรษณีย์ไทย: + + ถนนสีลม 100 + กรุงเทพมหานคร 10500 + ประเทศไทย + +ใช้รหัสไปรษณีย์ด้านบนบนฉลากจัดส่ง diff --git a/crates/nvisy-pattern/testdata/inputs/th/identity.txt b/crates/nvisy-pattern/testdata/inputs/th/identity.txt new file mode 100644 index 00000000..1231cbc8 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/th/identity.txt @@ -0,0 +1,3 @@ +บันทึกลูกค้า — สมชาย ใจดี — ดูเลขประจำตัวประชาชนด้านล่าง + +เลขประจำตัวประชาชน: 1-2345-67890-12-1 diff --git a/crates/nvisy-pattern/testdata/inputs/tr/contact.txt b/crates/nvisy-pattern/testdata/inputs/tr/contact.txt new file mode 100644 index 00000000..ee964a14 --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/tr/contact.txt @@ -0,0 +1,7 @@ +PTT teslimatı için posta adresi: + + Atatürk Caddesi 100 + 34000 İstanbul + Türkiye + +Yukarıdaki posta kodunu kargo etiketinde kullanın. diff --git a/crates/nvisy-pattern/testdata/inputs/tr/identity.txt b/crates/nvisy-pattern/testdata/inputs/tr/identity.txt new file mode 100644 index 00000000..3b611ccc --- /dev/null +++ b/crates/nvisy-pattern/testdata/inputs/tr/identity.txt @@ -0,0 +1,4 @@ +Müşteri kaydı — Ahmet Yılmaz — TCKN ve plaka aşağıdadır. + +TC Kimlik No: 12345678950 +Plaka: 34-ABC-1234 diff --git a/crates/nvisy-pattern/tests/builtin_au.rs b/crates/nvisy-pattern/tests/builtin_au.rs new file mode 100644 index 00000000..2d79528c --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_au.rs @@ -0,0 +1,52 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! AU-jurisdiction fixtures (`testdata/inputs/au/.txt`). +//! +//! Each test scans one AU fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! 
entities a real Australian document of that domain is +//! expected to surface (substring + label). + +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/au/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "123 456 782", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/au/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "51 824 753 556", + ); + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "123 456 780", + ); + assert_label_present(&entities, builtins::COMPANY_ID.label_ref()); +} + +#[tokio::test] +async fn builtin_health() { + let (text, entities) = scan(include_str!("../testdata/inputs/au/health.txt")).await; + assert_match( + &text, + &entities, + builtins::INSURANCE_ID.label_ref(), + "2228 12366 1", + ); +} diff --git a/crates/nvisy-pattern/tests/builtin_ca.rs b/crates/nvisy-pattern/tests/builtin_ca.rs new file mode 100644 index 00000000..7860b3cb --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_ca.rs @@ -0,0 +1,35 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! CA-jurisdiction fixtures (`testdata/inputs/ca/.txt`). +//! +//! Each test scans one CA fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real Canadian document of that domain is expected +//! to surface (substring + label). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/ca/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "123 456 782", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/ca/contact.txt")).await; + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "K1P 1A1", + ); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_de.rs b/crates/nvisy-pattern/tests/builtin_de.rs new file mode 100644 index 00000000..94336762 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_de.rs @@ -0,0 +1,114 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! DE-jurisdiction fixtures (`testdata/inputs/de/.txt`). +//! +//! Each test scans one DE fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real German document of that domain is expected to +//! surface (substring + label). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/de/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "L01X00T44", + ); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "C0J9H58P3", + ); + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "65929970489", + ); + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "DE129273398", + ); + assert_match( + &text, + &entities, + builtins::DRIVERS_LICENSE.label_ref(), + "B072RRE2I52", + ); + assert_match( + &text, + &entities, + builtins::NATIONAL_INSURANCE_NUMBER.label_ref(), + "15010685M016", + ); +} + +#[tokio::test] +async fn builtin_health() { + let (text, entities) = scan(include_str!("../testdata/inputs/de/health.txt")).await; + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "381789045", + ); + assert_match( + &text, + &entities, + builtins::MEDICAL_ID.label_ref(), + "123456601", + ); + assert_match( + &text, + &entities, + builtins::INSURANCE_ID.label_ref(), + "A000500015", + ); +} + +#[tokio::test] +async fn builtin_vehicle() { + let (text, entities) = scan(include_str!("../testdata/inputs/de/vehicle.txt")).await; + assert_match( + &text, + &entities, + builtins::LICENSE_PLATE.label_ref(), + "B-AB 1234", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/de/contact.txt")).await; + assert_match(&text, &entities, builtins::POSTAL_CODE.label_ref(), "10117"); + // English-language nationality dictionary stays silent on a + // German document — assert it didn't fire. 
+ assert!( + !entities + .iter() + .any(|e| e.label == builtins::NATIONALITY.label_ref()), + "english-language NATIONALITY dictionary should not match on a DE fixture", + ); + // Sanity: at least one PLZ entity surfaced. + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/de/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "HRB 123456", + ); +} diff --git a/crates/nvisy-pattern/tests/builtin_es.rs b/crates/nvisy-pattern/tests/builtin_es.rs new file mode 100644 index 00000000..ecf671b5 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_es.rs @@ -0,0 +1,61 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! ES-jurisdiction fixtures (`testdata/inputs/es/.txt`). +//! +//! Each test scans one ES fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real Spanish document of that domain is expected +//! to surface (substring + label). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/es/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "12345678Z", + ); + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "X1234567L", + ); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "AAA123456", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/es/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "A12345674", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/es/contact.txt")).await; + assert_match(&text, &entities, builtins::POSTAL_CODE.label_ref(), "28013"); + // English-language nationality dictionary stays silent on a + // Spanish document — assert it didn't fire. + assert!( + !entities + .iter() + .any(|e| e.label == builtins::NATIONALITY.label_ref()), + "english-language NATIONALITY dictionary should not match on an ES fixture", + ); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_fi.rs b/crates/nvisy-pattern/tests/builtin_fi.rs new file mode 100644 index 00000000..5488a5a3 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_fi.rs @@ -0,0 +1,19 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! FI-jurisdiction fixtures (`testdata/inputs/fi/.txt`). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/fi/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "010170-123F", + ); + assert_label_present(&entities, builtins::GOVERNMENT_ID.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_in.rs b/crates/nvisy-pattern/tests/builtin_in.rs new file mode 100644 index 00000000..fd7057f2 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_in.rs @@ -0,0 +1,60 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! IN-jurisdiction fixtures (`testdata/inputs/in/.txt`). +//! +//! Each test scans one IN fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real Indian document of that domain is expected +//! to surface (substring + label). + +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/in/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "2341 2341 2346", + ); + assert_match(&text, &entities, builtins::TAX_ID.label_ref(), "ABCPK1234E"); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "M1234567", + ); + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "ABC1234567", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/in/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "27AAAPL1234C1ZE", + ); + assert_match(&text, &entities, builtins::TAX_ID.label_ref(), "AAAPL1234C"); +} + +#[tokio::test] +async fn builtin_vehicle() { + let 
(text, entities) = scan(include_str!("../testdata/inputs/in/vehicle.txt")).await; + assert_match( + &text, + &entities, + builtins::LICENSE_PLATE.label_ref(), + "MH12AB1234", + ); + assert_label_present(&entities, builtins::LICENSE_PLATE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_it.rs b/crates/nvisy-pattern/tests/builtin_it.rs new file mode 100644 index 00000000..eeb54c2b --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_it.rs @@ -0,0 +1,61 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! IT-jurisdiction fixtures (`testdata/inputs/it/.txt`). +//! +//! Each test scans one IT fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real Italian document of that domain is expected +//! to surface (substring + label). + +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/it/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "RSSMRA70A01H501S", + ); + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "CA12345AB", + ); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "YA1234567", + ); + assert_match( + &text, + &entities, + builtins::DRIVERS_LICENSE.label_ref(), + "MI1234567A", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/it/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "00154980569", + ); + // English-language nationality dictionary stays silent on an + // Italian document — assert it didn't fire. 
+ assert!( + !entities + .iter() + .any(|e| e.label == builtins::NATIONALITY.label_ref()), + "english-language NATIONALITY dictionary should not match on an IT fixture", + ); + assert_label_present(&entities, builtins::TAX_ID.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_kr.rs b/crates/nvisy-pattern/tests/builtin_kr.rs new file mode 100644 index 00000000..919e4355 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_kr.rs @@ -0,0 +1,48 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! KR-jurisdiction fixtures (`testdata/inputs/kr/.txt`). + +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/kr/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "800101-1112343", + ); + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "900101-5112344", + ); + assert_match( + &text, + &entities, + builtins::PASSPORT_NUMBER.label_ref(), + "M123N4567", + ); + assert_match( + &text, + &entities, + builtins::DRIVERS_LICENSE.label_ref(), + "11-20-123456-78", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/kr/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "123-45-67891", + ); + assert_label_present(&entities, builtins::COMPANY_ID.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_ng.rs b/crates/nvisy-pattern/tests/builtin_ng.rs new file mode 100644 index 00000000..088f7def --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_ng.rs @@ -0,0 +1,30 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! NG-jurisdiction fixtures (`testdata/inputs/ng/.txt`). 
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/ng/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "12345678902", + ); +} + +#[tokio::test] +async fn builtin_vehicle() { + let (text, entities) = scan(include_str!("../testdata/inputs/ng/vehicle.txt")).await; + assert_match( + &text, + &entities, + builtins::LICENSE_PLATE.label_ref(), + "ABC-123DE", + ); + assert_label_present(&entities, builtins::LICENSE_PLATE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_pl.rs b/crates/nvisy-pattern/tests/builtin_pl.rs new file mode 100644 index 00000000..b1489db1 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_pl.rs @@ -0,0 +1,60 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! PL-jurisdiction fixtures (`testdata/inputs/pl/*.txt`). +//! +//! Each test scans one PL fixture through a recognizer wired +//! with every shipped pattern and dictionary, then asserts the +//! entities a real Polish document of that domain is expected +//! to surface (substring + label).
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/pl/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "44051401359", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/pl/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::TAX_ID.label_ref(), + "106-000-00-62", + ); + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "123456785", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/pl/contact.txt")).await; + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "00-001", + ); + // English-language nationality dictionary stays silent on a + // Polish document — assert it didn't fire. + assert!( + !entities + .iter() + .any(|e| e.label == builtins::NATIONALITY.label_ref()), + "english-language NATIONALITY dictionary should not match on a PL fixture", + ); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_se.rs b/crates/nvisy-pattern/tests/builtin_se.rs new file mode 100644 index 00000000..9cf741b4 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_se.rs @@ -0,0 +1,41 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! SE-jurisdiction fixtures (`testdata/inputs/se/*.txt`).
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/se/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "900101-1239", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/se/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "556677-1233", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/se/contact.txt")).await; + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "111 60", + ); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_sg.rs b/crates/nvisy-pattern/tests/builtin_sg.rs new file mode 100644 index 00000000..83542c8d --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_sg.rs @@ -0,0 +1,47 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! SG-jurisdiction fixtures (`testdata/inputs/sg/*.txt`).
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/sg/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "S2740116C", + ); + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "F1234567N", + ); +} + +#[tokio::test] +async fn builtin_finance() { + let (text, entities) = scan(include_str!("../testdata/inputs/sg/finance.txt")).await; + assert_match( + &text, + &entities, + builtins::COMPANY_ID.label_ref(), + "200512345R", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/sg/contact.txt")).await; + assert_match( + &text, + &entities, + builtins::POSTAL_CODE.label_ref(), + "018989", + ); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_th.rs b/crates/nvisy-pattern/tests/builtin_th.rs new file mode 100644 index 00000000..2ce212df --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_th.rs @@ -0,0 +1,25 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! TH-jurisdiction fixtures (`testdata/inputs/th/*.txt`).
+ +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/th/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "1-2345-67890-12-1", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/th/contact.txt")).await; + assert_match(&text, &entities, builtins::POSTAL_CODE.label_ref(), "10500"); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +} diff --git a/crates/nvisy-pattern/tests/builtin_tr.rs b/crates/nvisy-pattern/tests/builtin_tr.rs new file mode 100644 index 00000000..64b47400 --- /dev/null +++ b/crates/nvisy-pattern/tests/builtin_tr.rs @@ -0,0 +1,31 @@ +//! End-to-end: shipped patterns + dictionaries against the +//! TR-jurisdiction fixtures (`testdata/inputs/tr/*.txt`). + +mod fixtures; + +use fixtures::{assert_label_present, assert_match, scan}; +use nvisy_core::entity::builtins; + +#[tokio::test] +async fn builtin_identity() { + let (text, entities) = scan(include_str!("../testdata/inputs/tr/identity.txt")).await; + assert_match( + &text, + &entities, + builtins::GOVERNMENT_ID.label_ref(), + "12345678950", + ); + assert_match( + &text, + &entities, + builtins::LICENSE_PLATE.label_ref(), + "34-ABC-1234", + ); +} + +#[tokio::test] +async fn builtin_contact() { + let (text, entities) = scan(include_str!("../testdata/inputs/tr/contact.txt")).await; + assert_match(&text, &entities, builtins::POSTAL_CODE.label_ref(), "34000"); + assert_label_present(&entities, builtins::POSTAL_CODE.label_ref()); +}