diff --git a/README.md b/README.md index 627c6bf2..dae6f504 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,11 @@ await pipeline.run(); npm Project RDF into engine-agnostic search documents (framing + a declarative field spec) + + @lde/search-api-graphql + npm + Engine- and domain-agnostic GraphQL surface for search: builds an executable GraphQL schema from any SearchType at runtime + @lde/search-typesense npm @@ -229,6 +234,10 @@ graph TD subgraph Publication fastify-rdf docgen + search --> text-normalization + search-api-graphql --> search + search-typesense --> search + search-typesense --> text-normalization end subgraph Monitoring diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 8189cda5..df74737c 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -6,10 +6,8 @@ Date: 2026-06-25 Proposed -Reconciles against the NDE stack platform docs -(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`), which are themselves -a **draft under discussion**, so several decisions below are deliberate deviations from -the current draft, to be reconciled back into it. +Aligned with the NDE [stack platform docs](https://docs.nde.nl/stack/layers/platform); the +decisions below are reflected there. ## Context @@ -19,10 +17,9 @@ declarative source so the GraphQL surface, a later REST surface, and the index c from each other, and so a deployment can swap search engines without consumers noticing. That requires an engine- and protocol-neutral **core** that both API surfaces and any -engine adapter sit on. The platform draft frames this as Ports & Adapters with a framed -JSON-LD intermediate representation, generated from SHACL + a `search:` annotation -vocabulary. 
We adopt that direction but scope it to what a v1 keyword search needs, and -diverge on a few concrete points where the draft does not fit DR’s catalog-search case. +engine adapter sit on. The architecture is Ports & Adapters with a framed JSON-LD +intermediate representation, generated from SHACL + a `search:` annotation vocabulary, +scoped here to what a v1 keyword search needs. ## Decision @@ -32,26 +29,31 @@ Two tiers: `search-*` is backend you compose; `search-api-*` is the surface you | Tier | Package | Responsibility | | ----------- | ------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · adapter port | +| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · engine port | | backend | `@lde/search-typesense` | engine adapter: collection schema · query/filter compiler · `search()` | | API surface | `@lde/search-api-graphql` | field model + `SearchQuery` → GraphQL schema (runtime configuration; see [ADR 4](./0004-search-api-graphql-surface.md)) | | API surface | `@lde/search-api-rest` | OpenAPI + route handlers (later, thin over the core) | -This deviates from the draft’s function-mapping table (`@lde/graphql-server`, -`@lde/rest-server`, no core row); the draft should adopt the `@lde/search*` family. - ### Contract frozen, storage swappable The **API contract** (the SDL shape consumers couple to) is breaking to change and must be right in v1. The **IR / stored document** (framed JSON-LD vs a flat engine doc) lives behind the adapter and is swappable with no consumer impact. Nothing engine-specific (companion fields, `int32`, the engine query language) and nothing RDF-specific -(`@context`, `@id`, IRI-keyed predicates) leaks past the adapter port. +(`@context`, `@id`, IRI-keyed predicates) leaks past the engine port. 
### Field model The engine-neutral description of a queryable field – the runtime form of one SHACL -NodeShape + its `search:` annotations: +NodeShape + its `search:` annotations. **One `SearchField` declaration drives four +consumers** – projection (RDF→flat document), the engine collection schema, the query +semantics, and the GraphQL surface – so they cannot drift. + +It is a **unified** model: one declaration in place of three otherwise-separate ones – the +projection-side `FieldSpec`/`FieldKind`, the Typesense `SEARCH_FIELDS` (collection schema + +weights), and the query model below. `kind` plus capability flags replace the discriminated +projection kinds, derived fields are first-class, and the Typesense-vocabulary types are +_derived_ from `kind` rather than re-declared. ```ts type FieldKind = @@ -64,31 +66,48 @@ type FieldKind = | 'reference'; interface SearchField { - readonly name: string; // logical API name + readonly name: string; // logical API name; the physical fanout derives from it readonly kind: FieldKind; - readonly array?: boolean; - readonly localized?: boolean; + readonly path?: string; // sh:path to project from; omit for a derivation-populated field + readonly array?: boolean; // sh:maxCount + readonly required?: boolean; // sh:minCount ≥ 1 — non-null in output, non-optional in the index + readonly localized?: boolean; // rdf:langString / sh:languageIn (text only) + readonly locales?: readonly string[]; // when localized: which languages to emit readonly output?: boolean; // appears in the schema output type - readonly searchable?: { weight: number }; // free-text inclusion + weight + readonly searchable?: { weight: number }; // free-text inclusion + weight (per-locale when localized) readonly filterable?: boolean; // usable in `where` readonly facetable?: boolean; readonly sortable?: boolean; - readonly nestedStrategy?: 'labelOnly' | 'idOnly' | 'inline'; // for `reference` - readonly group?: { readonly name: string; readonly prefix: string 
}; // deployment delta + readonly ref?: { type: string; strategy: 'labelOnly' | 'idOnly' | 'inline' }; // kind: 'reference' + readonly transform?: (value: string) => string; // projection-time value transform + readonly facetRanges?: readonly FacetRange[]; // numeric facet: fixed [min, max) range bins (histogram) vs per-value buckets } -interface SearchSchema { +type Derivation = (document: SearchDocument, node: FramedNode) => void; + +// One root type (one SHACL NodeShape); a whole deployment’s declaration is the +// SearchSchema, a map of SearchTypes keyed by type IRI (built with searchSchema()). +interface SearchType { + readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; // computed fields: status, booleans } ``` -Maps onto SHACL + `search:` (`kind`←`sh:datatype`, `array`←`sh:maxCount`, -`localized`←`sh:languageIn`, `facetable`←`search:facetable`, `sortable`←`search:sortable`, -`nestedStrategy`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an eventual generator -emits it unchanged. The `group` companion (coarse grouped facets, e.g. `format_group`) and -the `status_rank` tie-break sort are **deployment-specific deltas**, never in `@lde/search`. -`relevance` is _not_ a delta: every full-text engine ranks by match score, so it is a -generic reserved sort the adapter understands. +Maps onto SHACL + `search:` (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, +`array`←`sh:maxCount`, `localized`←`sh:languageIn`, `facetable`←`search:facetable`, +`sortable`←`search:sortable`, `ref`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an +eventual generator emits it unchanged. A field with **no `path`** is a derived field – +populated by a `Derivation` rather than projected from the IR – yet it still carries full +query/schema/output behavior, which is how the former separate projection `FieldSpec` is +subsumed. 
The physical field names a declaration fans out to (`${name}_search_${locale}`, +`${name}_sort_${locale}`, `${name}_search`) follow one convention owned by +`@lde/search`, so projection, collection schema and query compiler agree. The `status_rank` +tie-break sort is a **deployment-specific delta**, never in `@lde/search`. Grouped facets need +no field-model mechanism at all: a deployment derivation materializes group tokens (e.g. +`group:rdf`) into the field’s own values – see Consequences. `relevance` is _not_ a delta: +every full-text engine ranks by match score, so it is a generic reserved sort the adapter +understands. ### `SearchQuery` – the neutral query IR @@ -147,25 +166,82 @@ variable-based clients (`$o: DatasetOrderBy`) break, so a future array is a deli **Inclusive bounds only** – `min`/`max`, no `gt`/`gte`/`lt`/`lte`: self-documenting, matches Typesense’s native inclusive range, covers every DR case, additively reversible. -Grouped facets need no special shape – `group:`-prefixed tokens travel as ordinary `in` -strings and the adapter splits/unions them. - -### Adapter port and result +A numeric facet returns **range buckets** (`[min, max)` bins declared per field); the adapter +maps them to the engine’s native range faceting. + +**Grouped facets need no special engine mechanism; they are denormalized at index time.** +A coarse category alongside granular values (e.g. `group:rdf` next to media types, `group:person` +next to class IRIs) is materialized into the field’s own values during projection, so at query +time a group token is an ordinary value: faceted natively, filtered by plain membership +(`field.in: ["group:rdf"]` unions with granular values for free), and — where the field is +`output` – read like any other value. There is no `_group` companion, no `group:`-prefix split, +no filter rewriting in the adapter; the engine stays dumb and denormalization (the document +store’s strength) does the work. 
A cross-source signal that is not a subset of the field (e.g. a +SPARQL capability derived from `conformsTo`, not a media type) is likewise materialized as a plain +value by a deployment derivation. + +The trade-off this design accepts: **group membership is fixed at index time.** Because the +group token is baked into each document’s values during projection, redefining a group (which +granular values map to `group:rdf`) is an index-data change that takes effect only on **reindex** – +there is no query-time mapping to edit. The constraint is acceptable here because group definitions +are deployment projection config that already drives indexing, and reindexing is already the +pipeline’s job; it would not suit a system where grouping is user-defined or changes frequently. + +### Engine port and result + +The **port** is the interface the core defines; a concrete engine **adapter** +(`@lde/search-typesense`’s `TypesenseSearchEngine`) implements it. Naming the port for the +capability (`SearchEngine`), not the pattern piece, keeps `TypesenseSearchEngine implements +SearchEngine` readable. ```ts -interface SearchAdapter { - search(query: SearchQuery, schema: SearchSchema): Promise; +// FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them +// to its type’s facetable / output field names for typo-safe facet and document access +// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). +interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, +> { + search( + query: SearchQuery, + searchType: SearchType, + ): Promise<SearchResult<FacetField, OutputField>>; } -interface SearchResult { - readonly hits: readonly { id: string; document: SearchDocument }[]; +interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit<OutputField>[]; readonly total: number; + // Keyed by facet field name; `Partial` because only the queried facets are present. 
+ // A bucket’s `label` (a LocalizedValue) is the engine-resolved canonical data label, + // present only for reference (IRI-keyed) facets; absent for token/free-string facets, + // whose display the consumer owns (its own i18n, or the value itself). readonly facets: Readonly< - Record + Partial< + Record< + FacetField, + readonly { value: string; count: number; label?: LocalizedValue }[] + > + > >; } -type SearchDocument = Record; +// `id` (the stable document key, an IRI) stays out of the document: it is the hit’s +// identity, always present, a different contract from the optional logical field values, +// and maps straight onto the GraphQL output’s `id: String!`. +interface SearchHit<OutputField extends string = string> { + readonly id: string; + readonly document: ResultDocument<OutputField>; +} + +// The logical result document. Named distinctly from the flat, fanned-out projection +// `SearchDocument` that lives index-side: this carries logical fields (language maps, +// references) ready for a surface to shape. +type ResultDocument<OutputField extends string = string> = Readonly< + Partial<Record<OutputField, SearchValue>> +>; type SearchValue = | string | number @@ -192,7 +268,7 @@ per-shape types (e.g. `Organization`, `Term`) with `label` exposed as `name` - **IR / adapter-return:** JSON-LD language map (`@container: @language`), `@set` arrays, `und` for untagged. Matches schema-profile #171 (language maps are more usable as a data - model) and the platform draft’s envelope. + model) and the stack platform envelope. - **GraphQL surface:** a single **best-first** `Accept-Language`-ordered list (`[LanguageString!]!`, see [ADR 4](./0004-search-api-graphql-surface.md)). 
`[0]` is the value to display; **`[0].language` is the language actually served** – the per-field @@ -208,7 +284,7 @@ argument (deferred): a parallel arg would duplicate the header and need preceden Chosen over a `{nl,en}` map (silently yields `undefined` for a missing language, no defined fallback order) and over a separate resolved scalar (the value must be a `LanguageString` to carry its language anyway, so the scalar saved only the `[0]` index – not worth a second -field plus a deviation from the draft / Network-of-Terms list shape). Grounded in measured +field plus diverging from the Network-of-Terms list shape). Grounded in measured data and all three substrates: - **A (descriptions, measured):** bilingual `nl`/`en`, ~86% Dutch-only → an English user gets @@ -222,32 +298,26 @@ have an English title) is distinct from content `dct:language` (already filterab preference; expressible as a facetable dimension (languages-present-in-a-localized-field), not enabled for DR v1, more relevant for B/C. -### Other reconciled decisions +### Other decisions - **Numbered pagination** (`offset`/`limit`, presented as page/per-page), not Relay cursors. DR is a page-numbered faceted browser with totals; Typesense is natively page/per-page; the ~2,500-doc corpus never paginates deep enough for offset cost to bite; and the blue/green alias swap removes the mutation-drift that motivates cursors. - **Sidecar canonical labels**, not inline `labelOnly` as default. Facets need one - canonical label per entity; the draft’s own two-source model puts canonical labels in a - separate collection, which is what DR’s `labels` collection is. `nestedStrategy` is - carried as metadata but inline `labelOnly` is not the default. -- **Logical typed result document** at the query seam; framed JSON-LD kept index-side. 
The - draft treats framed JSON-LD as the universal IR; we scope it to the index/projection - artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not catalog-search’s), - gated on the generic framing packages existing rather than on DR. + canonical label per entity, kept in a separate collection — DR’s `labels` collection. A + reference’s `strategy` is carried as metadata; `labelOnly` is the v1 default, not inline. +- **Logical typed result document** at the query seam; framed JSON-LD kept index-side as the + index/projection artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not + catalog-search’s), gated on the generic framing packages existing rather than on DR. ## Consequences - One declarative source drives GraphQL, later REST, and the index; they cannot drift. - The engine is a swappable adapter; the contract outlives engine choices. -- Adopted from the draft unchanged: the Stable API Contract discipline, `nestedStrategy` as - a concept, the surface `LanguageString` list, folding at the adapter boundary + query - side via `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. -- Deviations to reconcile into the platform draft: numbered pagination; sidecar labels; - logical result doc (framed JSON-LD scoped to index-side); `min`/`max` filter ranges; the - `@lde/search*` naming and a core package row. +- Carried through: the Stable API Contract discipline, the reference `strategy` concept, the + surface `LanguageString` list, folding at the adapter boundary + query side via + `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) - search; unifying the projection `FieldSpec` (RDF→doc) with this `SearchField` - (query/output) into one field declaration. + search. 
diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index d6aff824..678d6d04 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -11,42 +11,34 @@ Builds on [ADR 3 (Search API core query model)](./0003-search-api-core-query-mod ## Context Given the engine-neutral core of [ADR 3](./0003-search-api-core-query-model.md), the first -API surface is GraphQL. The platform draft requires the surface to be derived from the same -source as the index, never hand-written, so it cannot drift. It must also be framework-free: -resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any GraphQL server -can host the schema (DR mounts it inline; a Fastify wrapper is deferred and, if ever built, -is a separate package). +API surface is GraphQL, derived from the same source as the index so it cannot drift. It must +be framework-free: resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any +GraphQL server can host the schema (DR mounts it inline; a Fastify wrapper is a deferred +separate package). ## Decision ### Runtime configuration, not code generation -The platform draft frames this as _generating_ the surface – emitting GraphQL SDL **and** -resolvers as artifacts. We deviate: nothing is emitted or committed. The schema is -**constructed at runtime from the field-model configuration** (`buildSearchSchema(config)`), -once at startup, and the resolvers are **generic functions inside the package** attached to -that schema. A better name for the draft’s “generation” step, at least for this surface, is -**runtime configuration**. - -This matters because the resolvers are inherently generic – there is essentially one root -resolver that maps args to a `SearchQuery`, calls the adapter, and maps the result back; -the field model only parameterises data. 
Codegen would emit N near-identical resolver stubs +The surface is **constructed at runtime from the field-model configuration** +(`buildGraphQLSchema(config)`), once at startup, with generic resolvers shipped in the package +attached to that schema – nothing is emitted or committed. The resolvers are inherently +generic (one root resolver maps args to a `SearchQuery`, calls the engine, and maps the result +back; the field model only parameterises data), so codegen would emit N near-identical stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. -**No SDL artifact.** A live GraphQL API serves its own schema via introspection, so clients -need no committed `.graphql` file. The field-model diff is the reviewable change. A -`printSchema()` helper exists only as an **optional** CI snapshot test for catching -accidental breaking changes to the frozen contract – not a shipped artifact. - -> Deviation from the stack draft: the draft’s “generate SDL + resolvers” becomes -> _construct the schema at runtime from configuration; resolvers are generic and in-package; -> SDL is served live via introspection, not emitted._ For the reconciliation list. +A live GraphQL API serves its own schema via introspection, so clients need no committed +`.graphql` file; the field-model diff is the reviewable change. `printGraphQLSchema()` exists +only as an **optional** CI snapshot test guarding the frozen contract against accidental +breaking changes – not a shipped artifact. ### The schema-building function ```ts -function buildSearchSchema( - schema: SearchSchema, +// Generic over the config *value’s* type (capture it `as const satisfies SearchType`), so +// one declaration drives both the runtime schema and the static TS types below. 
function buildGraphQLSchema<S extends SearchType>( + schema: S, options: { typeName: string; // 'Dataset' – drives all derived type names queryField?: string; // root field; default lowercased plural of typeName @@ -60,30 +52,69 @@ function buildSearchSchema( }, ): GraphQLSchema; // executable schema: types + generic resolvers attached +// Static types derived from the SAME config value’s type (compile-time only, erased at +// runtime); one source, no codegen, no drift. Exported for typed in-process callers/tests. +type OutputOf<S>; // { id: string; title: LanguageString[]; size: number | null; … } +type WhereOf<S>; // { format?: StringFilter; size?: FloatRange; … } +type OrderByOf<S>; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } +type FacetOf<S>; // the facetable-field-name union + // also exported for manual composition / non-default servers: function buildSearchTypeDefsAndResolvers( schema, options, ): { typeDefs: string; resolvers: object }; // optional CI helper only: -function printSearchSchema(schema, options): string; // SDL, for a snapshot/breaking-change test +function printGraphQLSchema(schema, options): string; // SDL, for a snapshot/breaking-change test ``` -`buildSearchSchema` is the standalone, framework-agnostic artifact (depends only on +`buildGraphQLSchema` is the standalone, framework-agnostic artifact (depends only on `graphql` + `@graphql-tools/schema`). Deep customisation passes `extendTypeDefs`/ `extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or composes the exported typeDefs/resolvers by hand. +### A typed surface the contract does not depend on + +One `as const satisfies SearchType` declaration drives two **independent** projections: the +**runtime contract** (the `GraphQLSchema`, built at startup by reading the value – +`field.kind`, `output`, `facetable`, …) and a **static TS mirror** (`OutputOf` / +`WhereOf` / `OrderByOf` / `FacetOf`, computed from `typeof schema` via mapped types). 
+ +The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time only +and erased, so the served schema is byte-identical whether or not the mirror exists – it is a +developer-experience overlay. The two derivations can drift (the runtime kind→GraphQL-type +mapping lives in `buildGraphQLSchema`; the type-level mapping in `OutputOf` duplicates it), +so the **contract** is guarded by the optional `printGraphQLSchema()` SDL snapshot (the real +artifact), while the TS mirror only catches our own coding mistakes against it. + +Values are typed at both ends, with the resolver as the typed transform between them: + +| layer | localized text | reference | int64 | keyword (array) | boolean | +| ----------------------- | ------------------------------------ | --------------------------- | ---------------- | ----------------------- | -------------------- | +| IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | +| GraphQL (`OutputOf`) | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | + +What stays unchecked is only the generic resolver’s **dynamic middle**: it loops over the +field model with runtime-string names, so TS cannot prove the object it builds matches +`OutputOf` – it casts at that boundary, and graphql-js’s executor (not TS) enforces the +output types at runtime (a wrong-typed return raises a field error). Same “typed boundaries, +dynamic middle” shape as the engine port and the projection: type the edges where it is +honest, accept a cast where iteration is inherently dynamic. + ### Construction rules (field model → schema) -Type names derive from `typeName`; shared types (`LanguageString`, `Facet`, `FacetBucket`, -`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once. 
+Type names derive from `typeName`; shared types (`LanguageString`, `ValueBucket`, `RangeBucket`, +`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once, and the +per-type keyed facets object is named `<TypeName>Facets` (e.g. `DatasetFacets`). GraphQL field names are the field model `name` verbatim (declare camelCase). - **Output type** – one field per `output` field: `text`+`localized` → `[LanguageString!]!` (best-first; `[0].language` = served language, the per-field `Content-Language`); - `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int`; `number` → `Float`; - `date` → `String` (ISO 8601); `boolean` → `Boolean!` (absent = false); `reference` → - see below. Nullability from `array` / required / optional; `id` is `String!`. + `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int` (signed 32-bit); + `number` → `Float` (exact integers to 2^53); `date` → `String` (ISO 8601); `boolean` → + `Boolean!` (absent = false); `reference` → see below. Nullability from `array` / required / + optional; `id` is `String!`. A magnitude that can exceed 32 bits (a 64-bit count or byte size + – e.g. DR’s `size`) is `number` → `Float`, since `Int` would overflow; a `Long`/`BigInt` + custom scalar is the deferred alternative. - **Reference types** – a `reference` field is typed by the **referenced shape** (`sh:class`/`sh:node`), emitted once and reused by every field referencing the same shape. Its fields follow `nestedStrategy`: @@ -95,9 +126,9 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). | `inline` (later) | the named type plus the referenced shape’s projected fields | So DR emits `publisher: Organization` (the `foaf:Agent` shape) and - `terminologySource: [Term!]!`; a shape’s type is emitted once and reused by any field that - references it. Named, not a generic GraphQL `Reference`: going `labelOnly → inline` then - only _adds_ fields (non-breaking), whereas generic→named later would break the contract. 
+ `terminologySource: [Term!]!`. Named, not a generic GraphQL `Reference`: going + `labelOnly → inline` then only _adds_ fields (non-breaking), whereas generic→named later + would break the contract. - **`where` input** – one field per `filterable` field: `keyword`/`reference` → `StringFilter { in: [String!] }`; `integer` → `IntRange { min, max }`; `number` → @@ -105,14 +136,32 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). `is` value); `text` is excluded (it goes through the `query` arg). - **`orderBy`** – `RELEVANCE` (the sane default when a `query` is present) plus every `sortable` field, as an enum, in a single `{ field, direction }` input. Only - publicly-selectable sorts appear here; the resolver expands the client’s one choice into - the internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via - `queryDefaults` (never exposed). Single for now because a user picks one dimension. - Promoting it to a list later is backward-compatible only for inline-literal clients (list - input coercion wraps a single value); **variable-based clients break** (`$o: DatasetOrderBy` - is rejected where `[DatasetOrderBy!]` is expected), so a future array is a deliberate, - potentially breaking change – not a free one. -- **Facets** – an enum of every `facetable` field; requested per query, returned with counts. + publicly-selectable sorts appear; the resolver expands the client’s one choice into the + internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via + `queryDefaults` (never exposed). Single for now because a user picks one dimension; promoting + it to a list later is backward-compatible only for inline-literal clients (list input + coercion) – **variable-based clients break** (`$o: DatasetOrderBy` where `[DatasetOrderBy!]` + is expected) – so a future array is a deliberate, potentially breaking change. 
+- **Facets** – a **keyed object** (`<TypeName>Facets`, e.g. `DatasetFacets`), one field per `facetable` field, typed by + the field’s kind: a numeric range-facet field is `[RangeBucket!]!`, every other facet is + `[ValueBucket!]!`. The facet set and each bucket shape are thus encoded **statically in the + schema**, not discovered at runtime through an enum + polymorphic bucket (no `__typename`, no + fragments). **Selection is the request**: only the facet keys a query selects are computed + (the resolver inspects the selection), each with its **own where-filter removed** + (skip-own-filter – a multi-select facet still lists its other options; dropping a `status` + filter also drops the valid-only default, so the status facet counts across every status). + Two bucket types: + - `ValueBucket { value, count, label }` – `value` is the selection key (filter via + `field.in`); `label` (nullable) is the engine-resolved canonical **data** label, present + only for **reference** (IRI-keyed) facets, `null` for token/free-string facets whose + display the consumer owns (its i18n for controlled tokens like `valid` → “Geldig”/“Valid”, + or the `value` itself). The null is load-bearing. + - `RangeBucket { min, max, count }` – a half-open `[min, max)` numeric bin (`max` null on an + open-ended top bin), filtered via `field.range`. + - A grouped facet (a coarse category alongside granular values, e.g. `group:rdf` next to media + types) needs **no special bucket**: its tokens are denormalized into the field at index time, + so they are ordinary `ValueBucket` values – faceted, filtered (`field.in: ["group:rdf"]`) and, + where output, read like any other value (see ADR 0003). ### Resulting schema (DR example, abridged) @@ -137,35 +186,23 @@ type Dataset { publisher: Organization terminologySource: [Term!]! format: [String!]! - class: [String!]! - size: Int + size: Float # int64 magnitude → Float, not Int (32-bit) datePosted: String status: String iiif: Boolean! 
# … keyword, language, iiifManifestCount, ndeSchemaAp, linkedData, terms, persistentUris } -input StringFilter { - in: [String!] -} -input IntRange { - min: Int - max: Int -} -input DateRange { - min: String - max: String -} +# shared inputs are emitted once and reused: DR uses StringFilter + FloatRange + +# SortDirection (IntRange / DateRange are pruned – no filterable int/date field). input DatasetWhere { publisher: StringFilter format: StringFilter class: StringFilter status: StringFilter - size: IntRange - datePosted: DateRange - iiif: Boolean - # … keyword, language, terminologySource, catalog, ndeSchemaAp, linkedData, terms, persistentUris + size: FloatRange + # … keyword, language, terminologySource, catalog } enum DatasetSortField { @@ -174,36 +211,31 @@ enum DatasetSortField { DATE_POSTED SIZE } -enum SortDirection { - ASC - DESC -} input DatasetOrderBy { field: DatasetSortField! direction: SortDirection! = DESC } -enum DatasetFacetField { - PUBLISHER - KEYWORD - LANGUAGE - FORMAT - CLASS - TERMINOLOGY_SOURCE - STATUS - IIIF - NDE_SCHEMA_AP - LINKED_DATA - TERMS - PERSISTENT_URIS +type ValueBucket { + value: String! # selection key: a media type, a token (group:rdf), or an IRI for reference facets + count: Int! + label: [LanguageString!] # nullable; resolved data label for reference facets, else null } -type FacetBucket { - value: String! +type RangeBucket { + min: Float # half-open [min, max); max null = open-ended top bin + max: Float count: Int! } -type Facet { - field: DatasetFacetField! - buckets: [FacetBucket!]! +type DatasetFacets { + # one field per facetable field, typed by kind; selection = request, skip-own-filter applied + publisher: [ValueBucket!]! + keyword: [ValueBucket!]! + language: [ValueBucket!]! + format: [ValueBucket!]! + class: [ValueBucket!]! + terminologySource: [ValueBucket!]! + status: [ValueBucket!]! + size: [RangeBucket!]! } type DatasetSearchResult { @@ -211,7 +243,7 @@ type DatasetSearchResult { total: Int! page: Int! 
perPage: Int! - facets: [Facet!]! + facets: DatasetFacets! } type Query { @@ -220,19 +252,20 @@ type Query { where: DatasetWhere orderBy: DatasetOrderBy page: Int = 1 - perPage: Int = 20 - facets: [DatasetFacetField!] + perPage: Int = 20 # no `facets` arg – selecting facet keys IS the request ): DatasetSearchResult! } ``` Numbered pagination (`page`/`perPage` + `total`), per [ADR 3](./0003-search-api-core-query-model.md) – no Relay connection. The reference types -(`Organization`, `Term`) carry `id + name` (labelOnly) from DR’s sidecar labels collection, -resolved by the adapter. `publisher` is single (`dct:publisher` `maxCount 1`); `creator` is -search-only – its name feeds full-text `query` but it has no output field of its own, -mirroring the current card. `catalog` is filter-only, so it appears in `where` but not as an -output field. +carry `id + name` (labelOnly) from DR’s sidecar labels collection, resolved by the adapter. +`publisher` is single (`dct:publisher` `maxCount 1`); `creator` is search-only (its name feeds +full-text `query` but it has no output field); `catalog` is filter-only (in `where`, not output); +`class` is facet + filter but not output (its `group:` tokens surface only as facet buckets, never +as card values); `datePosted` is sortable + output only; and the NDE compatibility booleans +(`iiif`, `ndeSchemaAp`, `linkedData`, `terms`) are output-only vinkjes – in neither `where` nor the +facets until “filter by vinkje” ships. ### Resolver behaviour @@ -241,37 +274,33 @@ The single, generic root resolver (shipped in the package, not emitted): 1. **Args → `SearchQuery`** (pure): `query`→`text`; `where`→`Filter[]`; `orderBy`→`Sort[]` (`RELEVANCE`→reserved `relevance`); `page`/`perPage`→`offset`/`limit`; `facets`→logical names; `locale`←`context.acceptLanguage[0]`. -2. 
**Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; - DR injects its policy here: default `status:=valid`; default sort `relevance` when a - `query` is present else `title`; and the `status_rank` tie-break appended to either. -3. **`context.adapter.search(query, schema)` → `SearchResult`.** +2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; DR + injects its policy here: default `status:=valid`; default sort `relevance` when a `query` is + present else `title`; and the `status_rank` tie-break appended to either. +3. **`context.engine.search(query, schema)` → `SearchResult`.** 4. **`SearchResult` → output** – scalars pass through; a `LocalizedValue` map → - `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; - reference values likewise; facets keyed logical→enum. GraphQL field selection prunes. + `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; reference + values likewise; each selected `Facets` key resolves its own buckets. GraphQL field selection prunes. -Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, -then untagged (`und`) last – so `[0]` is always the best available value. +Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, then +untagged (`und`) last – so `[0]` is always the best available value. ### Lifecycle and performance -- **Built once at startup.** The consumer calls `buildSearchSchema` during boot and hands - the single `GraphQLSchema` to its server; the field model is static per deployment, so it - is never rebuilt per request. -- **Held and reused.** That one schema serves every request (Mercurius additionally - caches/compiles it).
-- **Zero per-request penalty vs codegen.** A runtime-constructed schema is the same - `GraphQLSchema` object codegen would have produced; the only added cost is the one-time - build, sub-millisecond to low-single-digit-ms for a schema this size. +- **Built once at startup, reused for every request.** The field model is static per + deployment, so the single `GraphQLSchema` is constructed during boot (sub-millisecond to + low-single-digit-ms for a schema this size) and never rebuilt per request – the same object + codegen would have produced, with no per-request penalty (Mercurius additionally caches it). - **Hot path is the engine, not GraphQL.** Per-request cost is dominated by the Typesense round-trip; parse/validate/resolve of a small query is sub-millisecond. -- **Introspection serves the contract.** Cheap (a query against the built schema, cached by - clients). Leave it on, or disable in production and use `printSearchSchema` for tooling. +- **Introspection serves the contract** (cheap, client-cached). Leave it on, or disable in + production and use `printGraphQLSchema` for tooling. ### Context contract ```ts interface SearchContext { - adapter: SearchAdapter; // any engine + engine: SearchEngine; // the port; any engine adapter acceptLanguage: readonly string[]; // parsed, ordered; drives locale + output ordering } ``` @@ -281,21 +310,18 @@ Each transport populates it per request; no framework type appears in the packag ## Consequences - The GraphQL surface is configured at runtime from the - [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the - index or a later REST surface, and works under any GraphQL server. + [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the index + or a later REST surface, and works under any GraphQL server. 
- **Frozen (public contract):** `LanguageString`, the named reference types (`Organization`, `Term`, …), output types, `where` operators, `orderBy` enums, numbered-pagination args, facet types. Breaking to change – right in v1. -- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes - facets, the `SearchDocument` shape. -- **Deviations to reconcile into the platform draft:** - - “generate SDL + resolvers” → _runtime configuration_ (construct at startup from config; - generic in-package resolvers; SDL served via introspection, not emitted as an artifact). - - Named reference types per shape (`Organization`, `Term`) rather than the draft’s uniform - `labelOnly` `{ @id, @type, name }` reference shape – chosen for ergonomics and - additive `inline` growth. -- Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail - stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor - pagination; a `Date` scalar (kept ISO `String`); transport-layer persisted queries / cost - limits; a root or per-field language argument (Accept-Language is the sole preference - mechanism); metadata-language-availability filtering (a facetable dimension, not v1). +- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes facets, + the `SearchDocument` shape. +- **Named reference types** per shape rather than one uniform reference type – chosen for + ergonomics and additive `inline` growth (`labelOnly` → `inline` only adds fields). 
+- Deferred: a `dataset(id)` single-resource query (DR detail stays on SPARQL); cross-collection + `@reference` joins beyond inline labels; cursor pagination; a `Date` scalar (kept ISO + `String`) and a `Long`/`BigInt` scalar for 64-bit integers (kept `Float`); transport-layer + persisted queries / cost limits; a root or per-field language argument (Accept-Language is the + sole preference mechanism); metadata-language-availability filtering (a facetable dimension, + not v1). diff --git a/package-lock.json b/package-lock.json index e1a4b8d8..6abec10e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24953,6 +24953,10 @@ "resolved": "packages/search", "link": true }, + "node_modules/@lde/search-api-graphql": { + "resolved": "packages/search-api-graphql", + "link": true + }, "node_modules/@lde/search-typesense": { "resolved": "packages/search-typesense", "link": true @@ -33133,7 +33137,6 @@ "version": "15.10.2", "resolved": "https://registry.npmjs.org/graphql/-/graphql-15.10.2.tgz", "integrity": "sha512-1PRqdDPAmViWr4h1GVBT8RoPZfWSGZa7kDzleTilOfVIslsgf+cia3Nl95v1KDmR4iERPaT7WzQ+tN4MJmbg3w==", - "dev": true, "license": "MIT", "engines": { "node": ">= 10.x" @@ -41008,7 +41011,7 @@ }, "packages/dataset": { "name": "@lde/dataset", - "version": "0.7.7", + "version": "0.7.8", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -41016,10 +41019,10 @@ }, "packages/dataset-registry-client": { "name": "@lde/dataset-registry-client", - "version": "0.8.4", + "version": "0.8.5", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", + "@lde/dataset": "^0.7.8", "@traqula/generator-sparql-1-1": "^1.1.6", "@traqula/parser-sparql-1-1": "^1.1.5", "@traqula/rules-sparql-1-1": "^1.1.0", @@ -41032,34 +41035,34 @@ }, "packages/distribution-downloader": { "name": "@lde/distribution-downloader", - "version": "0.6.5", + "version": "0.6.6", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", + "@lde/dataset": "^0.7.8", "filenamify-url": "4.0.0", "tslib": 
"^2.3.0" } }, "packages/distribution-health": { "name": "@lde/distribution-health", - "version": "0.2.1", + "version": "0.2.3", "license": "MIT", "dependencies": { - "@lde/distribution-probe": "0.2.1", - "@lde/sparql-importer": "0.6.5", + "@lde/distribution-probe": "^0.2.2", + "@lde/sparql-importer": "^0.6.5", "tslib": "^2.3.0" }, "devDependencies": { - "@lde/dataset": "0.7.7" + "@lde/dataset": "^0.7.7" } }, "packages/distribution-monitor": { "name": "@lde/distribution-monitor", - "version": "0.2.1", + "version": "0.2.3", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/distribution-probe": "0.2.1", + "@lde/dataset": "^0.7.8", + "@lde/distribution-probe": "^0.2.2", "c12": "^3.3.4", "commander": "^15.0.0", "cron": "^4.1.0", @@ -41086,10 +41089,10 @@ }, "packages/distribution-probe": { "name": "@lde/distribution-probe", - "version": "0.2.1", + "version": "0.2.3", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", + "@lde/dataset": "^0.7.8", "rdf-parse": "^5.0.0", "tslib": "^2.3.0" } @@ -41756,7 +41759,7 @@ }, "packages/docgen": { "name": "@lde/docgen", - "version": "0.6.18", + "version": "0.6.19", "license": "MIT", "dependencies": { "@tpluscode/rdf-ns-builders": "^5.0.0", @@ -41786,7 +41789,7 @@ }, "packages/fastify-rdf": { "name": "@lde/fastify-rdf", - "version": "0.4.6", + "version": "0.4.7", "license": "MIT", "dependencies": { "@fastify/accepts": "^5.0.0", @@ -42483,7 +42486,7 @@ }, "packages/iiif-validator": { "name": "@lde/iiif-validator", - "version": "0.1.4", + "version": "0.1.5", "license": "MIT", "dependencies": { "@iiif/parser": "^2.2.10", @@ -42492,7 +42495,7 @@ }, "packages/local-sparql-endpoint": { "name": "@lde/local-sparql-endpoint", - "version": "0.2.13", + "version": "0.2.14", "license": "MIT", "dependencies": { "jest-dev-server": "11.0.0", @@ -42505,15 +42508,15 @@ }, "packages/pipeline": { "name": "@lde/pipeline", - "version": "0.31.3", + "version": "0.31.5", "license": "MIT", "dependencies": { - "@lde/dataset": 
"0.7.7", - "@lde/dataset-registry-client": "0.8.4", - "@lde/distribution-health": "0.2.1", - "@lde/distribution-probe": "0.2.1", - "@lde/sparql-importer": "0.6.5", - "@lde/sparql-server": "0.4.11", + "@lde/dataset": "^0.7.7", + "@lde/dataset-registry-client": "^0.8.4", + "@lde/distribution-health": "^0.2.2", + "@lde/distribution-probe": "^0.2.2", + "@lde/sparql-importer": "^0.6.5", + "@lde/sparql-server": "^0.4.11", "@rdfjs/namespace": "^2.0.1", "@rdfjs/types": "^2.0.1", "@tpluscode/rdf-ns-builders": "^5.0.0", @@ -42531,7 +42534,7 @@ }, "packages/pipeline-console-reporter": { "name": "@lde/pipeline-console-reporter", - "version": "0.22.3", + "version": "0.22.5", "license": "MIT", "dependencies": { "chalk": "^5.4.1", @@ -42541,8 +42544,8 @@ "tslib": "^2.3.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-console-reporter/node_modules/ansi-regex": { @@ -42722,7 +42725,7 @@ }, "packages/pipeline-shacl-sampler": { "name": "@lde/pipeline-shacl-sampler", - "version": "0.5.3", + "version": "0.5.5", "license": "MIT", "dependencies": { "@rdfjs/types": "^2.0.1", @@ -42732,8 +42735,8 @@ "tslib": "^2.3.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-shacl-sampler/node_modules/n3": { @@ -42751,7 +42754,7 @@ }, "packages/pipeline-shacl-validator": { "name": "@lde/pipeline-shacl-validator", - "version": "0.13.3", + "version": "0.13.5", "license": "MIT", "dependencies": { "@rdfjs/types": "^2.0.1", @@ -42764,8 +42767,8 @@ "n3": "^2.1.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-shacl-validator/node_modules/n3": { @@ -42784,7 +42787,7 @@ }, "packages/pipeline-void": { "name": "@lde/pipeline-void", - "version": "0.29.3", + "version": 
"0.29.5", "license": "MIT", "dependencies": { "@rdfjs/types": "^2.0.1", @@ -42794,8 +42797,8 @@ "tslib": "^2.3.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-void/node_modules/n3": { @@ -42852,10 +42855,10 @@ }, "packages/search": { "name": "@lde/search", - "version": "0.1.1", + "version": "0.1.2", "license": "MIT", "dependencies": { - "@lde/text-normalization": "0.1.0", + "@lde/text-normalization": "^0.1.1", "@rdfjs/types": "^2.0.1", "@tpluscode/rdf-ns-builders": "^5.0.0", "jsonld": "^9.0.0", @@ -42866,11 +42869,23 @@ "n3": "^2.1.0" } }, + "packages/search-api-graphql": { + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "license": "MIT", + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } + }, "packages/search-typesense": { "name": "@lde/search-typesense", - "version": "0.1.0", + "version": "0.1.1", "license": "MIT", "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, @@ -42894,28 +42909,28 @@ }, "packages/sparql-importer": { "name": "@lde/sparql-importer", - "version": "0.6.5", + "version": "0.6.6", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/distribution-downloader": "0.6.5", - "@lde/task-runner": "0.2.11", + "@lde/dataset": "^0.7.8", + "@lde/distribution-downloader": "^0.6.5", + "@lde/task-runner": "^0.2.11", "tslib": "^2.3.0" } }, "packages/sparql-qlever": { "name": "@lde/sparql-qlever", - "version": "0.14.10", + "version": "0.14.11", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/distribution-downloader": "0.6.5", - "@lde/sparql-importer": "0.6.5", - "@lde/sparql-server": "0.4.11", - "@lde/task-runner": "0.2.11", - "@lde/task-runner-docker": "0.2.13", - "@lde/task-runner-native": "0.2.14", - "@lde/wait-for-sparql": "0.2.13", + "@lde/dataset": "^0.7.8", + 
"@lde/distribution-downloader": "^0.6.5", + "@lde/sparql-importer": "^0.6.5", + "@lde/sparql-server": "^0.4.11", + "@lde/task-runner": "^0.2.11", + "@lde/task-runner-docker": "^0.2.13", + "@lde/task-runner-native": "^0.2.14", + "@lde/wait-for-sparql": "^0.2.13", "rdf-parse": "^5.0.0", "rdf-serialize": "^5.1.0", "tslib": "^2.3.0", @@ -43620,7 +43635,7 @@ }, "packages/sparql-server": { "name": "@lde/sparql-server", - "version": "0.4.11", + "version": "0.4.12", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -43628,7 +43643,7 @@ }, "packages/task-runner": { "name": "@lde/task-runner", - "version": "0.2.11", + "version": "0.2.12", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -43636,10 +43651,10 @@ }, "packages/task-runner-docker": { "name": "@lde/task-runner-docker", - "version": "0.2.13", + "version": "0.2.14", "license": "MIT", "dependencies": { - "@lde/task-runner": "0.2.11", + "@lde/task-runner": "^0.2.12", "dockerode": "^5.0.1", "tslib": "^2.3.0" }, @@ -43649,16 +43664,16 @@ }, "packages/task-runner-native": { "name": "@lde/task-runner-native", - "version": "0.2.14", + "version": "0.2.15", "license": "MIT", "dependencies": { - "@lde/task-runner": "0.2.11", + "@lde/task-runner": "^0.2.12", "tslib": "^2.3.0" } }, "packages/text-normalization": { "name": "@lde/text-normalization", - "version": "0.1.0", + "version": "0.1.1", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -43666,7 +43681,7 @@ }, "packages/wait-for-sparql": { "name": "@lde/wait-for-sparql", - "version": "0.2.13", + "version": "0.2.14", "license": "MIT", "dependencies": { "fetch-sparql-endpoint": "^7.1.1", diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md new file mode 100644 index 00000000..d6274a9d --- /dev/null +++ b/packages/search-api-graphql/README.md @@ -0,0 +1,56 @@ +# @lde/search-api-graphql + +The GraphQL surface for the [`@lde/search`](../search) core. 
**Both engine- and +domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchType` +at runtime, and serves it with one generic resolver over any `SearchEngine`. It +names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, +`CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it +[`@lde/search-typesense`](../search-typesense) or another adapter). + +## Runtime configuration, not codegen + +`buildGraphQLSchema(searchType, { typeName })` constructs the schema once at +startup from the field model — no SDL artifact, no generated resolver stubs. The +field model is the single source; the GraphQL contract is whatever it produces. +Output types, the `where`/`orderBy`/facet inputs, reference types and nullability +are all derived from each field’s `kind` and capability flags. + +```ts +import { buildGraphQLSchema } from '@lde/search-api-graphql'; + +const gqlSchema = buildGraphQLSchema(DATASET, { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + }), +}); + +// Hand `gqlSchema` to any graphql-js server; populate the per-request context: +// { engine: SearchEngine, acceptLanguage: string[] } +``` + +## What it builds + +- **Output type** (`typeName`) — localized text → best-first `[LanguageString!]!` + (`[0].language` is the language actually served); references → named per-shape + types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` + → ISO 8601 string; nullability from `required` / `array` / `kind`. +- **`where`** — one input per `filterable` field (`StringFilter`, `IntRange` / + `FloatRange` / `DateRange`, or `Boolean`). +- **`orderBy`** — `RELEVANCE` plus every `sortable` field, as an enum. 
+- **Facets** — a keyed object, one field per `facetable` field (`[RangeBucket!]!` for range + facets, else `[ValueBucket!]!`); a value bucket carries `value` + `count` + a nullable `label` — + the resolved data label for **reference** facets, `null` for token/free-string facets whose + display the consumer owns (its own i18n, or the value itself). + +## Why it can’t drift + +The surface reads the same field model the index is built from, and compiles into +the same neutral `SearchQuery` the engine consumes — so the API, the index and a +future REST surface stay in lockstep. The contract is **frozen** (breaking to +change), and because it is generated rather than hand-written, a _consumer_ guards +it with a `printGraphQLSchema(searchType, options)` SDL snapshot over its **own** +search type and `typeName` — that snapshot also catches a `buildGraphQLSchema` +change in a future version of this library silently altering the consumer’s +contract. diff --git a/packages/search-api-graphql/eslint.config.mjs b/packages/search-api-graphql/eslint.config.mjs new file mode 100644 index 00000000..2dcaf60c --- /dev/null +++ b/packages/search-api-graphql/eslint.config.mjs @@ -0,0 +1,22 @@ +import baseConfig from '../../eslint.config.mjs'; + +export default [ + ...baseConfig, + { + files: ['**/*.json'], + rules: { + '@nx/dependency-checks': [ + 'error', + { + ignoredFiles: [ + '{projectRoot}/eslint.config.{js,cjs,mjs}', + '{projectRoot}/vite.config.{js,ts,mjs,mts}', + ], + }, + ], + }, + languageOptions: { + parser: await import('jsonc-eslint-parser'), + }, + }, +]; diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json new file mode 100644 index 00000000..70f76450 --- /dev/null +++ b/packages/search-api-graphql/package.json @@ -0,0 +1,32 @@ +{ + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchType at runtime (no codegen), served by one generic resolver over any SearchEngine.
You supply the search type and typeName; it names neither your domain nor your engine.", + "repository": { + "url": "git+https://github.com/ldelements/lde.git", + "directory": "packages/search-api-graphql" + }, + "license": "MIT", + "type": "module", + "exports": { + "./package.json": "./package.json", + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "development": "./src/index.ts", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "module": "./dist/index.js", + "types": "./dist/index.d.ts", + "files": [ + "dist", + "!**/*.tsbuildinfo" + ], + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } +} diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts new file mode 100644 index 00000000..836cc2a6 --- /dev/null +++ b/packages/search-api-graphql/src/build-schema.ts @@ -0,0 +1,514 @@ +import { + GraphQLBoolean, + GraphQLEnumType, + GraphQLFloat, + GraphQLInputObjectType, + GraphQLInt, + GraphQLList, + GraphQLNonNull, + GraphQLObjectType, + GraphQLSchema, + GraphQLString, + printSchema, + type GraphQLEnumValueConfigMap, + type GraphQLFieldConfig, + type GraphQLInputFieldConfig, + type GraphQLInputType, + type GraphQLOutputType, +} from 'graphql'; +import { + facetableFields, + filterableFields, + filterOperatorFor, + outputFields, + sortableFields, + type Filter, + type LocalizedValue, + type Reference, + type SearchEngine, + type SearchField, + type SearchQuery, + type SearchType, + type Sort, +} from '@lde/search'; +import { + defaultLanguageOrder, + toLanguageStrings, + type LanguageOrder, +} from './language.js'; + +/** Populated per request by the transport; no framework type appears here. */ +export interface SearchContext { + readonly engine: SearchEngine; + /** Parsed, ordered `Accept-Language`; drives locale selection and output order. 
*/ + readonly acceptLanguage: readonly string[]; + /** + * Called when a single facet's computation fails. The facet degrades to an + * empty list (a supplementary facet must not fail the whole query); supply + * this to log the cause. Optional — omit to swallow silently. + */ + readonly onFacetError?: (field: string, error: unknown) => void; +} + +export interface BuildGraphQLSchemaOptions { + /** Drives all derived type names, e.g. `Dataset`. */ + readonly typeName: string; + /** Root query field; defaults to the lowercased plural of `typeName`. */ + readonly queryField?: string; + /** Consumer policy applied to every query (default status, sort, tie-breaks). */ + readonly queryDefaults?: ( + query: SearchQuery, + context: SearchContext, + ) => SearchQuery; + /** Output-language ordering; defaults to Accept-Language-first, `und` last. */ + readonly languageOrder?: LanguageOrder; +} + +type Source = Record; + +const nonNullListOf = (type: GraphQLOutputType): GraphQLOutputType => + new GraphQLNonNull(new GraphQLList(new GraphQLNonNull(type))); + +const scalarOutput = ( + scalar: GraphQLOutputType, + field: SearchField, +): GraphQLOutputType => + field.required === true ? new GraphQLNonNull(scalar) : scalar; + +/** SCREAMING_SNAKE_CASE for an enum value name, e.g. `datePosted` → `DATE_POSTED`. */ +function screamingSnake(name: string): string { + return name.replace(/([a-z0-9])([A-Z])/g, '$1_$2').toUpperCase(); +} + +/** + * Construct an executable GraphQL schema from the unified {@link SearchField} + * model at runtime — no codegen, no SDL artifact. One generic resolver maps the + * arguments to a {@link SearchQuery}, calls `context.engine`, and maps the result + * back; the field model only parameterises data. + */ +export function buildGraphQLSchema( + searchType: SearchType, + options: BuildGraphQLSchemaOptions, +): GraphQLSchema { + const { typeName } = options; + const languageOrder = options.languageOrder ?? 
defaultLanguageOrder; + const queryField = + options.queryField ?? + `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; + + const languageString = new GraphQLObjectType({ + name: 'LanguageString', + fields: { + language: { type: GraphQLString }, + value: { type: new GraphQLNonNull(GraphQLString) }, + }, + }); + // A plain value facet bucket: a selection key, its count, and (for reference + // facets) the engine-resolved data label; null for token/free-string facets + // whose display the consumer owns. + const valueBucket = new GraphQLObjectType({ + name: 'ValueBucket', + fields: { + value: { type: new GraphQLNonNull(GraphQLString) }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + label: { + type: new GraphQLList(new GraphQLNonNull(languageString)), + resolve: (bucket: Source, _args: unknown, context: SearchContext) => { + const label = bucket.label as LocalizedValue | undefined; + return label + ? toLanguageStrings(label, context.acceptLanguage, languageOrder) + : null; + }, + }, + }, + }); + // A numeric range-facet bin: half-open `[min, max)` bounds (max null on an + // open-ended top bin) and the count of documents in it. + const rangeBucket = new GraphQLObjectType({ + name: 'RangeBucket', + fields: { + min: { + type: GraphQLFloat, + resolve: (bucket: Source) => bucket.min ?? null, + }, + max: { + type: GraphQLFloat, + resolve: (bucket: Source) => bucket.max ?? 
null, + }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + }, + }); + const sortDirection = new GraphQLEnumType({ + name: 'SortDirection', + values: { ASC: { value: 'asc' }, DESC: { value: 'desc' } }, + }); + const stringFilter = new GraphQLInputObjectType({ + name: 'StringFilter', + fields: { + in: { type: new GraphQLList(new GraphQLNonNull(GraphQLString)) }, + }, + }); + const intRange = rangeInput('IntRange', GraphQLInt); + const floatRange = rangeInput('FloatRange', GraphQLFloat); + const dateRange = rangeInput('DateRange', GraphQLString); + + const labelList = ( + resolveLabel: (source: Source) => LocalizedValue | undefined, + ) => ({ + type: nonNullListOf(languageString), + resolve: (source: Source, _args: unknown, context: SearchContext) => { + const value = resolveLabel(source); + return value + ? toLanguageStrings(value, context.acceptLanguage, languageOrder) + : []; + }, + }); + + // One reference type per referenced shape, reused by every field. + const referenceTypes = new Map(); + for (const field of outputFields(searchType)) { + if ( + field.kind === 'reference' && + field.ref && + !referenceTypes.has(field.ref.type) + ) { + referenceTypes.set( + field.ref.type, + new GraphQLObjectType({ + name: field.ref.type, + fields: { + id: { + type: new GraphQLNonNull(GraphQLString), + resolve: (source: Source) => (source as unknown as Reference).id, + }, + name: labelList((source) => (source as unknown as Reference).label), + }, + }), + ); + } + } + + const outputType = new GraphQLObjectType({ + name: typeName, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = { + id: { type: new GraphQLNonNull(GraphQLString) }, + }; + for (const field of outputFields(searchType)) { + fields[field.name] = outputFieldConfig(field); + } + return fields; + }, + }); + + function outputFieldConfig( + field: SearchField, + ): GraphQLFieldConfig { + const passthrough = (source: Source) => source[field.name] ?? 
null; + switch (field.kind) { + case 'text': + return labelList( + (source) => source[field.name] as LocalizedValue | undefined, + ); + case 'keyword': + return field.array === true + ? { + type: nonNullListOf(GraphQLString), + resolve: (s) => s[field.name] ?? [], + } + : { type: scalarOutput(GraphQLString, field), resolve: passthrough }; + case 'reference': { + const referenceType = referenceTypes.get(field.ref?.type ?? '')!; + return field.array === true + ? { + type: nonNullListOf(referenceType), + resolve: (s) => s[field.name] ?? [], + } + : { + type: + field.required === true + ? new GraphQLNonNull(referenceType) + : referenceType, + resolve: passthrough, + }; + } + case 'integer': + return { type: scalarOutput(GraphQLInt, field), resolve: passthrough }; + case 'number': + return { + type: scalarOutput(GraphQLFloat, field), + resolve: passthrough, + }; + case 'date': + // Stored as Unix seconds (int64); the surface serves ISO 8601 (ADR 4). + return { + type: scalarOutput(GraphQLString, field), + resolve: (source) => { + const value = source[field.name]; + return typeof value === 'number' + ? new Date(value * 1000).toISOString() + : (value ?? null); + }, + }; + case 'boolean': + return { + type: new GraphQLNonNull(GraphQLBoolean), + resolve: (source) => source[field.name] === true, + }; + } + } + + const whereInput = new GraphQLInputObjectType({ + name: `${typeName}Where`, + fields: () => { + const fields: Record = {}; + for (const field of filterableFields(searchType)) { + fields[field.name] = { type: whereFieldType(field) }; + } + return fields; + }, + }); + + function whereFieldType(field: SearchField): GraphQLInputType { + switch (filterOperatorFor(field.kind)) { + case 'in': + return stringFilter; + case 'range': + return field.kind === 'integer' + ? intRange + : field.kind === 'number' + ? 
floatRange + : dateRange; + default: + return GraphQLBoolean; + } + } + + const sortValues: GraphQLEnumValueConfigMap = { + RELEVANCE: { value: 'relevance' }, + }; + for (const field of sortableFields(searchType)) { + sortValues[screamingSnake(field.name)] = { value: field.name }; + } + const sortField = new GraphQLEnumType({ + name: `${typeName}SortField`, + values: sortValues, + }); + const orderByInput = new GraphQLInputObjectType({ + name: `${typeName}OrderBy`, + fields: { + field: { type: new GraphQLNonNull(sortField) }, + direction: { + type: new GraphQLNonNull(sortDirection), + defaultValue: 'desc', + }, + }, + }); + + // Keyed facets object: one field per facetable field, typed by its kind + // (range fields → [RangeBucket!], else [ValueBucket!]). Each field's resolver + // computes that facet with its OWN where-filter removed (skip-own-filter), so a + // multi-select facet still lists its other options; only the selected fields + // are resolved (GraphQL prunes the rest), so the selection IS the request. + const facetsType = new GraphQLObjectType({ + name: `${typeName}Facets`, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const field of facetableFields(searchType)) { + const isRange = + field.facetRanges !== undefined && field.facetRanges.length > 0; + fields[field.name] = { + type: nonNullListOf(isRange ? rangeBucket : valueBucket), + resolve: async ( + source: Source, + _args: unknown, + context: SearchContext, + ) => { + const query = source.query as SearchQuery; + // Drop this facet's own filter so its other options still count + // (a removed `status` filter also drops the valid-only default, so + // the status facet counts across every status). 
+ const facetQuery: SearchQuery = { + ...query, + where: query.where.filter( + (filter) => filter.field !== field.name, + ), + facets: [field.name], + limit: 0, + offset: 0, + }; + // A facet is supplementary: degrade a failed facet to an empty list + // rather than failing the whole query (which would null the non-null + // result and discard the items + every other facet). + try { + const result = await context.engine.search( + facetQuery, + searchType, + ); + return result.facets[field.name] ?? []; + } catch (error) { + context.onFacetError?.(field.name, error); + return []; + } + }, + }; + } + return fields; + }, + }); + + const resultType = new GraphQLObjectType({ + name: `${typeName}SearchResult`, + fields: { + items: { type: nonNullListOf(outputType) }, + total: { type: new GraphQLNonNull(GraphQLInt) }, + page: { type: new GraphQLNonNull(GraphQLInt) }, + perPage: { type: new GraphQLNonNull(GraphQLInt) }, + // Resolved lazily, per selected key (skip-own-filter); the result object + // (which carries the resolved `query`) is the facets source. + facets: { + type: new GraphQLNonNull(facetsType), + resolve: (source: Source) => source, + }, + }, + }); + + const query = new GraphQLObjectType({ + name: 'Query', + fields: { + [queryField]: { + type: new GraphQLNonNull(resultType), + args: { + query: { type: GraphQLString }, + where: { type: whereInput }, + orderBy: { type: orderByInput }, + page: { type: GraphQLInt, defaultValue: 1 }, + perPage: { type: GraphQLInt, defaultValue: 20 }, + }, + resolve: async (_source, args, context: SearchContext) => { + const built = argsToQuery(args as QueryArgs, context, searchType); + const finalQuery = options.queryDefaults + ? options.queryDefaults(built, context) + : built; + // Items + total only; facets are resolved lazily per selected key. 
+ const result = await context.engine.search( + { ...finalQuery, facets: [] }, + searchType, + ); + return { + items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), + total: result.total, + // Guard against a `perPage: 0` arg: `Math.floor(0/0)` is NaN, which a + // non-null `Int!` cannot serialize and would fail the whole query. + page: + finalQuery.limit > 0 + ? Math.floor(finalQuery.offset / finalQuery.limit) + 1 + : 1, + perPage: finalQuery.limit, + // Carried for the facets resolver (skip-own-filter per key). + query: finalQuery, + }; + }, + }, + }, + }); + + return new GraphQLSchema({ query }); +} + +/** + * The SDL of the built schema. Not a shipped artifact — a consumer uses it for an + * optional CI snapshot test over its own schema, catching accidental breaking + * changes to its frozen contract (including a `buildGraphQLSchema` change in a + * future version of this library silently altering it). + */ +export function printGraphQLSchema( + searchType: SearchType, + options: BuildGraphQLSchemaOptions, +): string { + return printSchema(buildGraphQLSchema(searchType, options)); +} + +interface QueryArgs { + readonly query?: string; + readonly where?: Record; + readonly orderBy?: { field: string; direction: 'asc' | 'desc' }; + readonly page?: number; + readonly perPage?: number; +} + +/** Pure args → {@link SearchQuery} mapping. */ +function argsToQuery( + args: QueryArgs, + context: SearchContext, + searchType: SearchType, +): SearchQuery { + const perPage = args.perPage ?? 20; + const page = args.page ?? 1; + return { + text: args.query, + where: whereToFilters(args.where, searchType), + orderBy: args.orderBy + ? [{ field: args.orderBy.field, direction: args.orderBy.direction }] + : [], + limit: perPage, + offset: (page - 1) * perPage, + // Facets are requested per-key by the facets resolver, not via an arg. + facets: [], + locale: context.acceptLanguage[0] ?? 
'und', + }; +} + +function whereToFilters( + where: Record | undefined, + searchType: SearchType, +): Filter[] { + if (where === undefined) { + return []; + } + const filters: Filter[] = []; + for (const field of filterableFields(searchType)) { + const value = where[field.name]; + if (value === undefined || value === null) { + continue; + } + switch (filterOperatorFor(field.kind)) { + case 'in': + filters.push({ + field: field.name, + in: (value as { in?: string[] }).in ?? [], + }); + break; + case 'range': { + const range = value as { min?: number | string; max?: number | string }; + filters.push({ + field: field.name, + range: { min: range.min, max: range.max }, + }); + break; + } + default: + filters.push({ field: field.name, is: value as boolean }); + } + } + return filters; +} + +function rangeInput( + name: string, + bound: typeof GraphQLInt | typeof GraphQLFloat | typeof GraphQLString, +): GraphQLInputObjectType { + return new GraphQLInputObjectType({ + name, + fields: { min: { type: bound }, max: { type: bound } }, + }); +} + +// Re-exported for callers that compose a sort manually. 
+export type { Sort }; diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts new file mode 100644 index 00000000..20c13223 --- /dev/null +++ b/packages/search-api-graphql/src/index.ts @@ -0,0 +1,7 @@ +export { buildGraphQLSchema, printGraphQLSchema } from './build-schema.js'; +export type { + SearchContext, + BuildGraphQLSchemaOptions, +} from './build-schema.js'; +export { defaultLanguageOrder, toLanguageStrings } from './language.js'; +export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/src/language.ts b/packages/search-api-graphql/src/language.ts new file mode 100644 index 00000000..96826f65 --- /dev/null +++ b/packages/search-api-graphql/src/language.ts @@ -0,0 +1,47 @@ +import type { LocalizedValue } from '@lde/search'; + +/** One entry of the surface’s best-first `[LanguageString!]!`. `language` is null + * for untagged (`und`) values; `[0]` is the value to display and `[0].language` + * is the language actually served (the per-field `Content-Language`). */ +export interface LanguageString { + readonly language: string | null; + readonly value: string; +} + +/** Orders a localized value’s available languages against the request. */ +export type LanguageOrder = ( + available: readonly string[], + accept: readonly string[], +) => readonly string[]; + +/** + * Default ordering: requested languages first (in request order), then the + * remaining tagged languages, then untagged (`und`) last — so `[0]` is always the + * best available value. + */ +export const defaultLanguageOrder: LanguageOrder = (available, accept) => { + const requested = accept.filter((language) => available.includes(language)); + const rest = available.filter( + (language) => language !== 'und' && !requested.includes(language), + ); + const untagged = available.includes('und') ? 
['und'] : []; + return [...requested, ...rest, ...untagged]; +}; + +/** Flatten a language map into a best-first `LanguageString` list. */ +export function toLanguageStrings( + value: LocalizedValue, + accept: readonly string[], + order: LanguageOrder, +): LanguageString[] { + const result: LanguageString[] = []; + for (const language of order(Object.keys(value), accept)) { + for (const text of value[language] ?? []) { + result.push({ + language: language === 'und' ? null : language, + value: text, + }); + } + } + return result; +} diff --git a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..63bc19de --- /dev/null +++ b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,101 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`GraphQL generator stability > emits a stable SDL for a representative schema 1`] = ` +"type Query { + things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20): ThingSearchResult! +} + +type ThingSearchResult { + items: [Thing!]! + total: Int! + page: Int! + perPage: Int! + facets: ThingFacets! +} + +type Thing { + id: String! + title: [LanguageString!]! + description: [LanguageString!]! + keyword: [String!]! + creator: [Agent!]! + publisher: Agent + size: Int + score: Float + created: String + status: String! + open: Boolean! +} + +type LanguageString { + language: String + value: String! +} + +type Agent { + id: String! + name: [LanguageString!]! +} + +type ThingFacets { + keyword: [ValueBucket!]! + creator: [ValueBucket!]! + publisher: [ValueBucket!]! + status: [ValueBucket!]! + open: [ValueBucket!]! +} + +type ValueBucket { + value: String! + count: Int! + label: [LanguageString!] 
+} + +input ThingWhere { + keyword: StringFilter + creator: StringFilter + publisher: StringFilter + size: IntRange + score: FloatRange + created: DateRange + status: StringFilter + open: Boolean +} + +input StringFilter { + in: [String!] +} + +input IntRange { + min: Int + max: Int +} + +input FloatRange { + min: Float + max: Float +} + +input DateRange { + min: String + max: String +} + +input ThingOrderBy { + field: ThingSortField! + direction: SortDirection! = DESC +} + +enum ThingSortField { + RELEVANCE + TITLE + SIZE + CREATED +} + +enum SortDirection { + ASC + DESC +} +" +`; diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts new file mode 100644 index 00000000..243b0ec9 --- /dev/null +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -0,0 +1,503 @@ +import { describe, expect, it } from 'vitest'; +import { graphql, printSchema } from 'graphql'; +import type { + SearchEngine, + SearchQuery, + SearchResult, + SearchType, +} from '@lde/search'; +import { buildGraphQLSchema, type SearchContext } from '../src/build-schema.js'; + +const schema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + facetable: true, + output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10 }, + ], + }, + { name: 'datePosted', kind: 'date', sortable: true, output: true }, + { name: 'score', kind: 'number', output: true }, + { + name: 'terminologySource', + 
kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'Term', strategy: 'labelOnly' }, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'iiif', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +/** A fake engine that records the query it received and returns a canned result. */ +function fakeEngine(result: SearchResult): { + engine: SearchEngine; + received: () => SearchQuery; +} { + let captured: SearchQuery; + return { + engine: { + async search(query) { + captured = query; + return result; + }, + }, + received: () => captured, + }; +} + +const canned: SearchResult = { + total: 1, + hits: [ + { + id: 'https://d/1', + document: { + title: { nl: ['Titel'], en: ['Title'] }, + keyword: ['kaarten'], + publisher: { + id: 'https://org/1', + label: { nl: ['Het Utrechts Archief'] }, + }, + size: 1234, + datePosted: 1_700_000_000, + score: 4.5, + terminologySource: [ + { id: 'https://term/1', label: { nl: ['Kaarten'] } }, + ], + status: 'valid', + iiif: true, + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, +}; + +async function run( + source: string, + context: SearchContext, + variables?: Record, +) { + return graphql({ + schema: buildGraphQLSchema(schema, { typeName: 'Dataset' }), + source, + contextValue: context, + variableValues: variables, + }); +} + +describe('buildGraphQLSchema', () => { + it('resolves a query, mapping the result to the typed output', async () => { + const { engine, received } = fakeEngine(canned); + const result = await run( + `{ + datasets(query: "kaart") { + total + page + perPage + items { + id + title { language value } + keyword + publisher { id name { language value } } + terminologySource { id name { language value } } + size + datePosted + score + status + iiif + } + facets { keyword { value count } } + } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + 
expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect(data.page).toBe(1); + const item = (data.items as Record[])[0]; + expect(item.id).toBe('https://d/1'); + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: 'en', value: 'Title' }, + ]); + expect(item.keyword).toEqual(['kaarten']); + expect(item.publisher).toEqual({ + id: 'https://org/1', + name: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }); + expect(item.size).toBe(1234); + expect(item.datePosted).toBe('2023-11-14T22:13:20.000Z'); + expect(item.score).toBe(4.5); + expect(item.terminologySource).toEqual([ + { id: 'https://term/1', name: [{ language: 'nl', value: 'Kaarten' }] }, + ]); + expect(item.iiif).toBe(true); + expect(data.facets).toEqual({ + keyword: [{ value: 'kaarten', count: 3 }], + }); + // The free-text arg became the query text. + expect(received().text).toBe('kaart'); + }); + + it('orders the output list best-first for the requested language', async () => { + const { engine } = fakeEngine(canned); + const result = await run( + `{ datasets { items { title { language value } } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'en', value: 'Title' }, + { language: 'nl', value: 'Titel' }, + ]); + }); + + it('places untagged (und) values last with a null language', async () => { + const { engine } = fakeEngine({ + total: 1, + facets: {}, + hits: [ + { + id: 'x', + document: { title: { nl: ['Titel'], und: ['Naamloos'] } }, + }, + ], + }); + const result = await run( + `{ datasets { items { title { language value } datePosted } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + 
{ language: null, value: 'Naamloos' }, + ]); + // An absent date resolves to null (the non-numeric branch). + expect(item.datePosted).toBeNull(); + }); + + it('labels reference-facet buckets, leaving plain-facet buckets null', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + publisher: [ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets { facets { + publisher { value count label { language value } } + keyword { value count label { language value } } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { + publisher: unknown[]; + keyword: unknown[]; + }; + expect(facets.publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }, + ]); + expect(facets.keyword).toEqual([ + { value: 'kaarten', count: 3, label: null }, + ]); + }); + + it('exposes range-facet bucket bounds, null for value facets and open ends', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + size: [ + { value: '0', count: 2, min: 1, max: 10 }, + // Open-ended top bin: lower bound only. + { value: '1', count: 5, min: 10 }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets { facets { + size { min max count } + keyword { value count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { + size: unknown[]; + keyword: unknown[]; + }; + // RangeBuckets carry their half-open bounds (max null = open-ended top bin). + expect(facets.size).toEqual([ + { min: 1, max: 10, count: 2 }, + { min: 10, max: null, count: 5 }, + ]); + // A value facet's ValueBuckets carry no bounds. 
+ expect(facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); + + it('resolves every selected facet key, returning [] where the engine has none', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + const result = await run( + `{ datasets { facets { + keyword { value count } + publisher { value count } + terminologySource { value count } + status { value count } + iiif { value count } + size { min max count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as Record; + expect(facets.keyword).toEqual([{ value: 'kaarten', count: 1 }]); + // Keys the engine returned nothing for resolve to an empty list. + for (const key of [ + 'publisher', + 'terminologySource', + 'status', + 'iiif', + 'size', + ]) { + expect(facets[key]).toEqual([]); + } + }); + + it('computes a facet with its own where-filter removed (skip-own-filter)', async () => { + const { engine, received } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + await run( + `{ datasets(where: { keyword: { in: ["x"] }, status: { in: ["valid"] } }) { + facets { keyword { value count } } + } }`, + { engine, acceptLanguage: ['nl'] }, + ); + // The keyword facet query is run with the keyword filter dropped (so its + // other options still count), but other filters (status) retained. + const facetQuery = received(); + expect(facetQuery.facets).toEqual(['keyword']); + expect( + facetQuery.where.find((filter) => filter.field === 'keyword'), + ).toBeUndefined(); + expect(facetQuery.where).toContainEqual({ field: 'status', in: ['valid'] }); + }); + + it('degrades a failed facet to an empty list without failing the whole query', async () => { + // A facet is supplementary: its computation runs a separate search (with + // `facets` set). Fail only that, leaving the listing search untouched. 
+ const failedFacets: string[] = []; + const engine: SearchEngine = { + async search(query) { + if (query.facets.length > 0) { + throw new Error('facet backend unavailable'); + } + return canned; + }, + }; + const result = await run( + `{ datasets { + total + items { id } + facets { keyword { value count } } + } }`, + { + engine, + acceptLanguage: ['nl'], + onFacetError: (field) => failedFacets.push(field), + }, + ); + + // No top-level error: the failed facet degraded rather than nulling the + // non-null result and discarding the items. + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect((data.items as Record[])[0].id).toBe('https://d/1'); + // The failed facet degraded to an empty list, and the cause was reported. + expect((data.facets as Record).keyword).toEqual([]); + expect(failedFacets).toEqual(['keyword']); + }); + + it('guards perPage: 0, resolving page to 1 rather than failing on NaN', async () => { + const { engine } = fakeEngine(canned); + const result = await run(`{ datasets(perPage: 0) { page total } }`, { + engine, + acceptLanguage: ['nl'], + }); + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.page).toBe(1); + }); + + it('maps where, orderBy and pagination into the SearchQuery', async () => { + const { engine, received } = fakeEngine(canned); + await run( + `{ + datasets( + where: { status: { in: ["valid"] }, keyword: {}, size: { min: 1, max: 9 }, iiif: true } + orderBy: { field: SIZE, direction: ASC } + page: 3 + perPage: 10 + ) { total } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + const query = received(); + expect(query.where).toContainEqual({ field: 'status', in: ['valid'] }); + // An empty StringFilter compiles to an empty membership. 
+ expect(query.where).toContainEqual({ field: 'keyword', in: [] }); + expect(query.where).toContainEqual({ + field: 'size', + range: { min: 1, max: 9 }, + }); + expect(query.where).toContainEqual({ field: 'iiif', is: true }); + expect(query.orderBy).toEqual([{ field: 'size', direction: 'asc' }]); + // Facets are requested per key via selection, not an arg; the listing query + // carries none. + expect(query.facets).toEqual([]); + expect(query.limit).toBe(10); + expect(query.offset).toBe(20); + }); + + it('falls back to the und locale when no Accept-Language is given', async () => { + const { engine, received } = fakeEngine(canned); + await run(`{ datasets { total } }`, { engine, acceptLanguage: [] }); + expect(received().locale).toBe('und'); + }); + + it('applies queryDefaults before calling the engine', async () => { + let captured: SearchQuery | undefined; + const engine: SearchEngine = { + async search(query) { + captured = query; + return canned; + }, + }; + const gqlSchema = buildGraphQLSchema(schema, { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + orderBy: [{ field: 'relevance', direction: 'desc' }], + }), + }); + await graphql({ + schema: gqlSchema, + source: `{ datasets { total } }`, + contextValue: { engine, acceptLanguage: ['nl'] }, + }); + expect(captured?.where).toEqual([{ field: 'status', in: ['valid'] }]); + expect(captured?.orderBy).toEqual([ + { field: 'relevance', direction: 'desc' }, + ]); + }); + + it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { + const sdl = printSchema( + buildGraphQLSchema(schema, { typeName: 'Dataset' }), + ); + expect(sdl).toMatch(/status: String!/); // required + expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable + expect(sdl).toMatch(/title: \[LanguageString!\]!/); + expect(sdl).toMatch(/keyword: \[String!\]!/); + expect(sdl).toMatch(/iiif: Boolean!/); + 
expect(sdl).toMatch(/publisher: Organization\b(?!!)/); // optional reference + }); + + it('builds the where, orderBy enum and keyed facets object from the field model', () => { + const sdl = printSchema( + buildGraphQLSchema(schema, { typeName: 'Dataset' }), + ); + expect(sdl).toMatch(/enum DatasetSortField/); + expect(sdl).toMatch(/RELEVANCE/); + expect(sdl).toMatch(/SIZE/); + // Facets are a keyed object, one field per facetable field, typed by kind. + expect(sdl).toMatch(/type DatasetFacets/); + expect(sdl).toMatch(/keyword: \[ValueBucket!\]!/); + expect(sdl).toMatch(/size: \[RangeBucket!\]!/); + expect(sdl).toMatch(/input DatasetWhere/); + expect(sdl).toMatch(/status: StringFilter/); + expect(sdl).toMatch(/size: IntRange/); + }); +}); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts new file mode 100644 index 00000000..c78b1535 --- /dev/null +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchType } from '@lde/search'; +import { printGraphQLSchema } from '../src/build-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. Its + * SDL is snapshotted purely to pin the **generator**: any change to how + * `buildGraphQLSchema` maps the field model (nullability, type names, enums, + * reference reuse) surfaces as a snapshot diff before this library is published, + * so a consumer’s contract can’t shift from under it by accident. 
+ */ +const THING: SearchType = { + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + required: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + // Two references sharing a shape → the Agent type is emitted once and reused. + { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + output: true, + }, + { name: 'score', kind: 'number', filterable: true, output: true }, + { + name: 'created', + kind: 'date', + filterable: true, + sortable: true, + output: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'open', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('GraphQL generator stability', () => { + it('emits a stable SDL for a representative schema', () => { + expect(printGraphQLSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); + }); +}); diff --git a/packages/search-api-graphql/tsconfig.json b/packages/search-api-graphql/tsconfig.json new file mode 100644 index 00000000..62ebbd94 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.base.json", + "files": [], + "include": [], + "references": [ + { + "path": "./tsconfig.lib.json" + }, + { + 
"path": "./tsconfig.spec.json" + } + ] +} diff --git a/packages/search-api-graphql/tsconfig.lib.json b/packages/search-api-graphql/tsconfig.lib.json new file mode 100644 index 00000000..64610bac --- /dev/null +++ b/packages/search-api-graphql/tsconfig.lib.json @@ -0,0 +1,26 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "tsBuildInfoFile": "dist/tsconfig.lib.tsbuildinfo", + "emitDeclarationOnly": false, + "types": ["node"] + }, + "include": ["src/**/*.ts"], + "references": [{ "path": "../search/tsconfig.lib.json" }], + "exclude": [ + "vite.config.ts", + "vite.config.mts", + "vitest.config.ts", + "vitest.config.mts", + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx" + ] +} diff --git a/packages/search-api-graphql/tsconfig.spec.json b/packages/search-api-graphql/tsconfig.spec.json new file mode 100644 index 00000000..04480f69 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.spec.json @@ -0,0 +1,29 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./out-tsc/vitest", + "types": [ + "vitest/globals", + "vitest/importMeta", + "vite/client", + "node", + "vitest" + ] + }, + "include": [ + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx", + "test/**/*.d.ts" + ], + "references": [ + { + "path": "./tsconfig.lib.json" + } + ] +} diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts new file mode 100644 index 00000000..7434ca80 --- /dev/null +++ b/packages/search-api-graphql/vite.config.ts @@ -0,0 +1,21 @@ +/// +import { defineConfig, mergeConfig } from 'vite'; +import baseConfig from '../../vite.base.config.js'; + +export default mergeConfig( + baseConfig, + 
defineConfig({ + root: __dirname, + cacheDir: '../../node_modules/.vite/packages/search-api-graphql', + test: { + coverage: { + thresholds: { + functions: 100, + lines: 100, + branches: 88.63, + statements: 100, + }, + }, + }, + }), +); diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index b5d62bb9..efffc145 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -1,13 +1,27 @@ # @lde/search-typesense -[Typesense](https://typesense.org/) engine adapter for RDF-backed search -pipelines. Engine-specific (Typesense) but domain-agnostic – the caller supplies -the collection schema and documents. - -The engine-agnostic half of the pipeline – framing `CONSTRUCT` quads into a -JSON-LD IR and projecting that IR into flat documents from a declarative field -spec – lives in [`@lde/search`](../search). This package consumes those -documents and writes them to Typesense. +[Typesense](https://typesense.org/) engine adapter for the engine- and +domain-agnostic [`@lde/search`](../search) core. **Engine-specific (Typesense) but +domain-agnostic** – you supply a `SearchType`; this package never names your +domain. It is the Typesense implementation of the `SearchEngine` port: it derives +a collection schema from the field model, compiles the neutral `SearchQuery` into +Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, +and manages the index lifecycle (blue/green rebuild). + +## Collection schema and engine + +`buildCollectionSchema(searchType, { name, defaultSortingField, … })` derives a +Typesense collection from the unified `SearchField` model — the Typesense field +type comes from each field’s `kind`, and the physical fanout (per-locale +search/sort keys) matches what the projection writes, via +`@lde/search`’s `physicalFields`, so the index and the documents cannot drift. 
+ +`createTypesenseSearchEngine(client, { collection, labelsCollection })` is the +`SearchEngine` implementation: it compiles the query, runs the search, resolves +reference (and reference-facet) labels from the sidecar `labels` collection in a +single lookup, and reconstructs the logical `SearchResult` — language maps, +labelled references, labelled facet buckets. The pure halves `buildSearchParams` +and `parseSearchResponse` are exported for direct use and testing. ## Indexing diff --git a/packages/search-typesense/package.json b/packages/search-typesense/package.json index b1dde852..445624fb 100644 --- a/packages/search-typesense/package.json +++ b/packages/search-typesense/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-typesense", "version": "0.1.1", - "description": "Generic Typesense engine adapter for RDF-backed search pipelines: collection lifecycle, bulk upsert and blue/green alias swap", + "description": "Typesense implementation of the @lde/search SearchEngine port: collection-schema builder, query compiler, label-resolving result reconstruction, and blue/green index lifecycle. Engine-specific (Typesense) but domain-agnostic.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-typesense" @@ -25,6 +25,8 @@ "!**/*.tsbuildinfo" ], "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts new file mode 100644 index 00000000..af133b08 --- /dev/null +++ b/packages/search-typesense/src/collection-schema.ts @@ -0,0 +1,132 @@ +import type { CollectionCreateSchema } from 'typesense'; +import type { CollectionFieldSchema } from 'typesense/lib/Typesense/Collection.js'; +import { physicalFields, type SearchField, type SearchType } from '@lde/search'; + +/** Deployment-specific options the generic field model does not carry. 
*/ +export interface CollectionSchemaOptions { + /** The Typesense collection (or alias) name. */ + readonly name: string; + /** Snowball stemming locale for non-localized searchable fields (default `nl`). + * Localized text search fields stem in their own locale. */ + readonly defaultLocale?: string; + /** The field Typesense sorts by when a query imposes no order. */ + readonly defaultSortingField?: string; + /** Synonym sets the collection references (synced separately). */ + readonly synonymSets?: readonly string[]; +} + +/** + * Build a Typesense collection schema from the unified {@link SearchType}, so + * the index and the projection are driven by one declarative source and cannot + * drift. Each field fans out into the same physical fields the projection writes + * ({@link physicalFields}); the Typesense field type is derived from the field + * `kind`, never re-declared. + * + * Stemming is enabled on every folded `*_search` field: localized text stems + * each `*_search_${locale}` in its own language, and a non-localized searchable + * field stems in `defaultLocale`. + */ +export function buildCollectionSchema( + searchType: SearchType, + options: CollectionSchemaOptions, +): CollectionCreateSchema { + const defaultLocale = options.defaultLocale ?? 'nl'; + const collection: CollectionCreateSchema = { + name: options.name, + fields: searchType.fields.flatMap((field) => + typesenseFields(field, defaultLocale, options.defaultSortingField), + ), + }; + if (options.defaultSortingField !== undefined) { + collection.default_sorting_field = options.defaultSortingField; + } + if (options.synonymSets !== undefined) { + collection.synonym_sets = [...options.synonymSets]; + } + return collection; +} + +/** The physical Typesense fields one declaration produces. 
*/ +function typesenseFields( + field: SearchField, + defaultLocale: string, + defaultSortingField: string | undefined, +): CollectionFieldSchema[] { + const names = physicalFields(field); + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + return [ + // Display labels: stored, not indexed for search (search uses the folded + // companions), accents preserved. + ...names.display.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + index: false, + optional: true, + }), + ), + // One folded search field per locale, each stemmed in its own language. + ...names.search.map( + (name, index): CollectionFieldSchema => ({ + name, + type: 'string', + optional: true, + stem: true, + locale: locales[index], + }), + ), + ...names.sort.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + sort: true, + optional: true, + }), + ), + ]; + } + + const valueType = typesenseValueType(field); + const fields: CollectionFieldSchema[] = [ + { + name: field.name, + type: valueType, + facet: field.facetable ?? false, + sort: field.sortable ?? false, + // A `required` field is non-optional; so is the `default_sorting_field`, + // which Typesense requires to be present. Everything else may be absent. + optional: field.required !== true && field.name !== defaultSortingField, + }, + ]; + if (field.searchable) { + for (const name of names.search) { + fields.push({ + name, + type: valueType, + optional: true, + stem: true, + locale: defaultLocale, + }); + } + } + return fields; +} + +/** The Typesense field type for a non-localized field, from its `kind`. 64-bit + * integers (and dates, stored as Unix seconds) so large counts never overflow. 
*/
// A field declared `array: true` maps to the array variant of its type for every
// kind, not only the string kinds; otherwise a multi-valued numeric or boolean
// field would be declared with a scalar collection type.
function typesenseValueType(field: SearchField): CollectionFieldSchema['type'] {
  const multiValued = field.array === true;
  switch (field.kind) {
    case 'integer':
    case 'date':
      return multiValued ? 'int64[]' : 'int64';
    case 'number':
      return multiValued ? 'float[]' : 'float';
    case 'boolean':
      return multiValued ? 'bool[]' : 'bool';
    case 'keyword':
    case 'reference':
    case 'text':
      return multiValued ? 'string[]' : 'string';
  }
}
diff --git a/packages/search-typesense/src/index.ts b/packages/search-typesense/src/index.ts index 6514638d..66247957 100644 --- a/packages/search-typesense/src/index.ts +++ b/packages/search-typesense/src/index.ts @@ -1 +1,9 @@ export { rebuild } from './adapter.js'; +export { buildCollectionSchema } from './collection-schema.js'; +export type { CollectionSchemaOptions } from './collection-schema.js'; +export { buildSearchParams } from './query-compiler.js'; +export { createTypesenseSearchEngine, parseSearchResponse } from './search.js'; +export type { + TypesenseSearchEngineOptions, + TypesenseSearchResponse, +} from './search.js'; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts new file mode 100644 index 00000000..dfeede8c --- /dev/null +++ b/packages/search-typesense/src/query-compiler.ts @@ -0,0 +1,242 @@ +import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; +import { fold } from '@lde/text-normalization'; +import { + physicalFields, + searchableFields, + type FacetRange, + type Filter, + type SearchField, + type SearchQuery, + type SearchType, + type Sort, +} from '@lde/search'; + +/** + * Compile the engine-neutral {@link SearchQuery} into Typesense search + * parameters — the query half of the engine adapter. Pure (no client, no env), + * so the mapping is asserted directly in unit tests. Field names come from + * {@link physicalFields}, the same convention the projection and the collection + * schema use, so a query can never reference a field the index does not carry.
*/
/** Tuning options for {@link buildSearchParams}. */
export interface CompileOptions {
  /**
   * Cap on the number of buckets returned per facet (`max_facet_values`). Left
   * unset, Typesense defaults to 10 — too few for high-cardinality facets
   * (publisher, keyword), so a deployment with such facets must raise it. Range
   * facets return one bucket per declared range regardless, but a value > the
   * range count is still safe.
   */
  readonly maxFacetValues?: number;
}

/**
 * Compile the engine-neutral {@link SearchQuery} into Typesense search
 * parameters.
 *
 * - `q` is the query text passed through `fold`, or the wildcard `*` when the
 *   text is absent or empty.
 * - `query_by` / `query_by_weights` come from {@link queryFields}.
 * - `filter_by`, `sort_by` and `facet_by` are only set when non-empty;
 *   `max_facet_values` only when facets are requested and the option is given.
 * - Pagination is delegated to {@link compilePagination}.
 *
 * @param query The engine-neutral query.
 * @param searchType The field model the query is compiled against.
 * @param options Optional compile-time tuning.
 * @returns The Typesense search parameters.
 */
export function buildSearchParams(
  query: SearchQuery,
  searchType: SearchType,
  options: CompileOptions = {},
): SearchParams {
  const folded =
    query.text !== undefined && query.text.length > 0
      ? fold(query.text)
      : undefined;
  const { names, weights } = queryFields(searchType, query.locale);
  const filterBy = compileFilterBy(query.where, searchType);
  const sortBy = query.orderBy
    .map((sort) => compileSort(sort, searchType, query.locale))
    .join(',');
  const params: SearchParams = {
    q: folded ?? '*',
    query_by: names.join(','),
    query_by_weights: weights.join(','),
    ...compilePagination(query.offset, query.limit),
  };
  if (filterBy.length > 0) {
    params.filter_by = filterBy;
  }
  if (sortBy.length > 0) {
    params.sort_by = sortBy;
  }
  if (query.facets.length > 0) {
    params.facet_by = compileFacetBy(query.facets, searchType);
    if (options.maxFacetValues !== undefined) {
      params.max_facet_values = options.maxFacetValues;
    }
  }
  return params;
}

/**
 * Translate an `offset`/`limit` window into Typesense pagination parameters.
 *
 * - A facet-only query (`limit` of 0) fetches no hits; the page is meaningless,
 *   so it is pinned to 1 rather than dividing by zero.
 * - When `offset` is a whole multiple of `limit` the window is an exact page and
 *   is expressed as `page` / `per_page`.
 * - Otherwise no page number describes the window (rounding down would return
 *   hits starting before `offset`), so Typesense's `offset` / `limit`
 *   alternative is used to request exactly the hits asked for.
 */
function compilePagination(
  offset: number,
  limit: number,
): Pick<SearchParams, 'page' | 'per_page' | 'offset' | 'limit'> {
  if (limit <= 0) {
    return { per_page: limit, page: 1 };
  }
  if (offset % limit === 0) {
    return { per_page: limit, page: offset / limit + 1 };
  }
  return { limit, offset };
}

/**
 * The `facet_by` clause. A facet on a numeric field that declares
 * {@link SearchField.facetRanges} is faceted into those fixed half-open
 * `[min, max)` bins (a histogram); every other facet is a plain per-value facet
 * on its field name. Typesense range syntax is already
 * start-inclusive/end-exclusive, so the declared bounds pass straight through
 * with no boundary fix-up.
*/
function compileFacetBy(
  facets: readonly string[],
  searchType: SearchType,
): string {
  const clauses: string[] = [];
  for (const facetName of facets) {
    const declared = searchType.fields.find(
      (candidate) => candidate.name === facetName,
    );
    const ranges = declared?.facetRanges;
    if (declared !== undefined && ranges !== undefined && ranges.length > 0) {
      clauses.push(compileRangeFacet(declared.name, ranges));
    } else {
      // Unknown fields and fields without ranges facet per value, by name.
      clauses.push(facetName);
    }
  }
  return clauses.join(',');
}

/**
 * Render a range facet as `name(key:[min, max], …)`. A missing bound is left
 * blank, which Typesense reads as open-ended (e.g. `[75, ]`).
 */
function compileRangeFacet(
  name: string,
  ranges: readonly FacetRange[],
): string {
  const bins: string[] = [];
  for (const { key, min, max } of ranges) {
    bins.push(`${key}:[${min ?? ''}, ${max ?? ''}]`);
  }
  return `${name}(${bins.join(', ')})`;
}

/**
 * Collect the `query_by` field names together with their position-aligned
 * weights. Every searchable field contributes its folded `*_search`
 * companion(s). For a localized text field the companion of the active locale
 * carries the declared weight, while companions of other locales are demoted by
 * one (never below 1): matches in the user's language rank first, yet
 * cross-language matches still surface. Non-localized fields keep the declared
 * weight on every companion.
 */
function queryFields(
  searchType: SearchType,
  locale: string,
): { readonly names: string[]; readonly weights: number[] } {
  const names: string[] = [];
  const weights: number[] = [];
  for (const field of searchableFields(searchType)) {
    const companions = physicalFields(field).search;
    const fullWeight = field.searchable.weight;
    const demotedWeight = Math.max(1, fullWeight - 1);
    // `undefined` marks a non-localized field: no per-locale demotion applies.
    const locales =
      field.kind === 'text' && field.localized === true
        ? (field.locales ?? [])
        : undefined;
    companions.forEach((name, position) => {
      names.push(name);
      weights.push(
        locales === undefined || locales[position] === locale
          ? fullWeight
          : demotedWeight,
      );
    });
  }
  return { names, weights };
}

/** AND-join the compiled `where` clauses; skips unknown fields and empty clauses.
*/
function compileFilterBy(
  where: readonly Filter[],
  searchType: SearchType,
): string {
  return where
    .map((filter) => compileFilter(filter, searchType))
    .filter((clause): clause is string => clause !== undefined)
    .join(' && ');
}

/**
 * Compile one filter into a Typesense `filter_by` clause.
 *
 * - A filter on a field the search type does not declare yields `undefined`.
 * - `in` compiles to a membership clause; an empty value list yields
 *   `undefined`.
 * - `range` compiles via {@link compileRange}.
 * - Anything else is an exact `:=` match on `is`. A string value is
 *   backtick-escaped exactly like membership values, so reserved characters
 *   (`&`, `,`, `:`, `(`, …) are taken literally instead of being parsed as
 *   filter syntax; non-string values (booleans, numbers) are emitted bare.
 *
 * @returns The clause, or `undefined` when the filter contributes nothing.
 */
function compileFilter(
  filter: Filter,
  searchType: SearchType,
): string | undefined {
  const field = searchType.fields.find(
    (candidate) => candidate.name === filter.field,
  );
  if (field === undefined) {
    return undefined;
  }
  if ('in' in filter) {
    return filter.in.length > 0
      ? compileMembership(field, filter.in)
      : undefined;
  }
  if ('range' in filter) {
    return compileRange(field.name, filter.range);
  }
  const value: unknown = filter.is;
  const literal =
    typeof value === 'string' ? escapeFilterValue(value) : String(value);
  return `${field.name}:=${literal}`;
}

/**
 * A membership clause. A non-facet (tokenized) field uses the exact `:=`
 * operator so an IRI cannot partial-match on a shared path segment.
 */
function compileMembership(
  field: SearchField,
  values: readonly string[],
): string {
  const exact = field.facetable !== true;
  return membership(field.name, values, exact);
}

/**
 * Render `name:[…]` (or `name:=[…]` when `exact`) with every value
 * backtick-escaped.
 */
function membership(
  name: string,
  values: readonly string[],
  exact: boolean,
): string {
  const list = `[${values.map(escapeFilterValue).join(',')}]`;
  return exact ? `${name}:=${list}` : `${name}:${list}`;
}

/**
 * An inclusive Typesense range clause: `[min..max]` when both bounds are set,
 * `>=min` or `<=max` when only one is, and `undefined` when neither is.
 */
function compileRange(
  name: string,
  range: { readonly min?: number | string; readonly max?: number | string },
): string | undefined {
  const { min, max } = range;
  if (min !== undefined && max !== undefined) {
    return `${name}:[${min}..${max}]`;
  }
  if (min !== undefined) {
    return `${name}:>=${min}`;
  }
  if (max !== undefined) {
    return `${name}:<=${max}`;
  }
  return undefined;
}

/**
 * One `sort_by` term.
`relevance` maps to Typesense’s `_text_match`; a localized + * text field sorts on its active-locale folded key; any other field (including a + * deployment tie-break like `status_rank`) sorts on its own name. + */ +function compileSort( + sort: Sort, + searchType: SearchType, + locale: string, +): string { + if (sort.field === 'relevance') { + return `_text_match:${sort.direction}`; + } + const field = searchType.fields.find( + (candidate) => candidate.name === sort.field, + ); + if ( + field !== undefined && + field.kind === 'text' && + field.localized === true + ) { + return `${field.name}_sort_${locale}:${sort.direction}`; + } + return `${sort.field}:${sort.direction}`; +} + +/** + * Backtick-wrap a filter value so reserved characters in IRIs and media types + * (`:`, `/`, `&`, `,`, …) are taken literally instead of parsed as filter syntax. + * An embedded backtick is escaped. + */ +export function escapeFilterValue(value: string): string { + return `\`${value.replace(/`/g, '\\`')}\``; +} diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts new file mode 100644 index 00000000..fc52aca4 --- /dev/null +++ b/packages/search-typesense/src/search.ts @@ -0,0 +1,411 @@ +import type { Client } from 'typesense'; +import { + outputFields, + type FacetBucket, + type LocalizedValue, + type Reference, + type ResultDocument, + type SearchEngine, + type SearchField, + type SearchHit, + type SearchQuery, + type SearchResult, + type SearchType, + type SearchValue, +} from '@lde/search'; +import { buildSearchParams, escapeFilterValue } from './query-compiler.js'; + +/** Where the engine reads documents and (optionally) reference labels. */ +export interface TypesenseSearchEngineOptions { + /** The dataset collection or alias to query. */ + readonly collection: string; + /** The sidecar `labels` collection (IRI → label); omit for id-only references. 
*/ + readonly labelsCollection?: string; + /** + * Buckets returned per facet (`max_facet_values`). Typesense defaults to 10; + * raise it for high-cardinality facets (publisher, keyword) so their long + * value lists are not truncated. + */ + readonly maxFacetValues?: number; + /** + * Called when reference-label resolution fails; the search then degrades to + * id-only references rather than failing. Optional — omit to swallow silently. + */ + readonly onLabelError?: (error: unknown) => void; + /** + * Opt-in in-memory label cache. When set (and {@link labelsCollection} is + * set), the FULL sidecar `labels` collection is loaded once via the documents + * export endpoint and held in a process-lifetime cache for this many + * milliseconds; each `search` then resolves its reference labels by in-memory + * lookup instead of a per-search `multi_search` round-trip. Omit to keep the + * per-search {@link fetchLabels} behaviour unchanged. + */ + readonly labelCacheTtlMs?: number; +} + +/** + * A Typesense-backed {@link SearchEngine}. `search` compiles the query + * ({@link buildSearchParams}), runs it, resolves the reference labels for the + * page of hits from the sidecar `labels` collection in one lookup, and + * reconstructs the engine-neutral {@link SearchResult} ({@link parseSearchResponse}). + * Every engine specific stays here; consumers see only logical documents. + */ +export function createTypesenseSearchEngine( + client: Client, + options: TypesenseSearchEngineOptions, +): SearchEngine { + // Process-lifetime cache for the FULL `labels` collection, held in the engine + // closure. Populated lazily on the first cached search; `loadAll` is the + // single-flight in-flight promise so concurrent first-loads share one export. 
+ let cachedLabels: ReadonlyMap | undefined; + let cacheExpiresAt = 0; + let inFlightLoad: Promise> | undefined; + + function cachedAllLabels( + labelsCollection: string, + ttlMs: number, + ): Promise> { + if (cachedLabels !== undefined && Date.now() < cacheExpiresAt) { + return Promise.resolve(cachedLabels); + } + // Single-flight: a load already running serves every concurrent caller. + inFlightLoad ??= loadAllLabels(client, labelsCollection) + .then((loaded) => { + cachedLabels = loaded; + cacheExpiresAt = Date.now() + ttlMs; + return loaded; + }) + // A failed load degrades to id-only references and is NOT cached, so the + // next search retries rather than serving an empty map for the whole TTL. + .catch((error) => { + options.onLabelError?.(error); + return new Map(); + }) + .finally(() => { + inFlightLoad = undefined; + }); + return inFlightLoad; + } + + return { + async search( + query: SearchQuery, + searchType: SearchType, + ): Promise { + const params = buildSearchParams(query, searchType, { + maxFacetValues: options.maxFacetValues, + }); + const response = (await client + .collections(options.collection) + .documents() + .search(params)) as TypesenseSearchResponse; + // Labels are supplementary: a failed lookup (e.g. the sidecar collection + // mid-rebuild) degrades to id-only references rather than failing the whole + // search, so the listing still renders with bare IRIs. + let labels: ReadonlyMap = new Map(); + if (options.labelsCollection !== undefined) { + if (options.labelCacheTtlMs !== undefined) { + // Cached path: resolve the page's references by in-memory lookup + // against the once-loaded collection (no Typesense round-trip). 
+ const allLabels = await cachedAllLabels( + options.labelsCollection, + options.labelCacheTtlMs, + ); + labels = selectLabels(allLabels, referenceIris(response, searchType)); + } else { + try { + labels = await fetchLabels( + client, + options.labelsCollection, + referenceIris(response, searchType), + ); + } catch (error) { + options.onLabelError?.(error); + } + } + } + return parseSearchResponse(response, searchType, labels); + }, + }; +} + +/** + * Load the FULL `labels` collection into a label map via the documents export + * endpoint, which streams every document as JSONL (one JSON object per line). + * Each line is reconstructed by {@link labelToLocalizedValue}, exactly as the + * per-search {@link fetchLabels} path does for its `multi_search` hits. + */ +async function loadAllLabels( + client: Pick, + collection: string, +): Promise> { + const jsonl = await client.collections(collection).documents().export(); + const labels = new Map(); + for (const line of jsonl.split('\n')) { + if (line.length === 0) { + continue; + } + const document = JSON.parse(line) as Record; + labels.set(String(document.id), labelToLocalizedValue(document)); + } + return labels; +} + +/** Narrow the cached collection to just the labels `iris` actually need. */ +function selectLabels( + allLabels: ReadonlyMap, + iris: readonly string[], +): Map { + const labels = new Map(); + for (const iri of iris) { + const label = allLabels.get(iri); + if (label !== undefined) { + labels.set(iri, label); + } + } + return labels; +} + +/** Every distinct reference IRI whose label the result will actually use. 
*/
function referenceIris(
  response: TypesenseSearchResponse,
  searchType: SearchType,
): string[] {
  const isReference = (field: SearchField): boolean =>
    field.kind === 'reference';
  const allReferenceNames = new Set(
    searchType.fields.filter(isReference).map((field) => field.name),
  );
  // Only OUTPUT reference fields end up in a reconstructed hit, so labels for a
  // non-output reference (e.g. a facet-only `class` carrying dozens of IRIs per
  // hit) would be resolved for nothing.
  const outputReferenceNames = outputFields(searchType)
    .filter(isReference)
    .map((field) => field.name);

  const collected = new Set<string>();
  for (const { document } of response.hits ?? []) {
    for (const name of outputReferenceNames) {
      const stored = document[name];
      if (Array.isArray(stored)) {
        stored.forEach((value) => collected.add(String(value)));
      } else if (typeof stored === 'string') {
        collected.add(stored);
      }
    }
  }
  // Bucket values of a reference facet are IRIs as well — including facet-only
  // references such as `class` — and are resolved in the same lookup.
  for (const { field_name: facetField, counts } of response.facet_counts ?? []) {
    if (!allReferenceNames.has(facetField)) {
      continue;
    }
    for (const bucket of counts) {
      collected.add(bucket.value);
    }
  }
  return Array.from(collected);
}

/**
 * Resolve labels for `iris` from the sidecar `labels` collection. Each
 * `label_${locale}` becomes a language-map entry; the default `label` is the
 * untagged (`und`) fallback when no locale variant exists.
 *
 * Sent over `multi_search` (POST) in batches: the id-list of a page or facet
 * carrying many references — e.g. a dataset with dozens of classes — would
 * overflow Typesense’s GET query-string limit (4000 chars, and IRIs URL-encode
 * to several times their length) if it travelled in the URL. POST puts it in the
 * body; the batch size stays under Typesense’s `per_page` cap. Exported for
 * unit testing against a fake client.
*/
export async function fetchLabels(
  client: Pick<Client, 'multiSearch'>,
  collection: string,
  iris: readonly string[],
): Promise<Map<string, LocalizedValue>> {
  const resolved = new Map<string, LocalizedValue>();
  let cursor = 0;
  // Batches are resolved one after another, each as a single-search
  // `multi_search` request.
  while (cursor < iris.length) {
    const chunk = iris.slice(cursor, cursor + LABEL_BATCH_SIZE);
    cursor += LABEL_BATCH_SIZE;
    const idFilter = `id:[${chunk.map(escapeFilterValue).join(',')}]`;
    const response = (await client.multiSearch.perform({
      searches: [
        {
          collection,
          q: '*',
          query_by: 'label',
          filter_by: idFilter,
          // One hit per requested id at most, so the chunk size is enough.
          per_page: chunk.length,
        },
      ],
    })) as { results: readonly TypesenseSearchResponse[] };
    const hits = response.results[0]?.hits ?? [];
    for (const { document } of hits) {
      resolved.set(String(document.id), labelToLocalizedValue(document));
    }
  }
  return resolved;
}

/**
 * Number of reference IRIs resolved per `multi_search` request. Typesense caps
 * `per_page` at 250 and the POST body holds the id-list comfortably, so the
 * batch size stays below that cap.
 */
const LABEL_BATCH_SIZE = 200;

/**
 * Convert a `labels` document into a language map: every string-valued
 * `label_${locale}` property becomes the single-element entry for `locale`.
 * When no such property exists, a string `label` becomes the `und` entry.
 */
function labelToLocalizedValue(
  document: Record<string, unknown>,
): LocalizedValue {
  const prefix = 'label_';
  const languageMap: Record<string, string[]> = {};
  for (const [property, candidate] of Object.entries(document)) {
    if (!property.startsWith(prefix) || typeof candidate !== 'string') {
      continue;
    }
    languageMap[property.slice(prefix.length)] = [candidate];
  }
  const fallback = document.label;
  if (Object.keys(languageMap).length === 0 && typeof fallback === 'string') {
    languageMap.und = [fallback];
  }
  return languageMap;
}

/** The subset of a Typesense search response this adapter reads.
*/ +export interface TypesenseSearchResponse { + readonly found: number; + readonly hits?: readonly { readonly document: Record }[]; + readonly facet_counts?: readonly { + readonly field_name: string; + readonly counts: readonly { + readonly value: string; + readonly count: number; + }[]; + }[]; +} + +/** + * Reconstruct a Typesense response into the engine-neutral {@link SearchResult}: + * the flat, fanned-out document is turned back into a logical one (per-locale + * display fields → a language map, reference IRIs → labelled references via the + * sidecar `labels` lookup, scalars passed through). `labels` maps a reference IRI + * to its resolved label; an IRI absent from it yields an id-only reference. + */ +export function parseSearchResponse( + response: TypesenseSearchResponse, + searchType: SearchType, + labels: ReadonlyMap, +): SearchResult { + const hits: SearchHit[] = (response.hits ?? []).map((hit) => ({ + id: String(hit.document.id), + document: reconstructDocument(hit.document, searchType, labels), + })); + // Reference facets are IRI-keyed; their buckets carry a resolved data label. + // Plain facets (tokens, free strings) carry no label — the consumer owns display. + const referenceFacets = new Set( + searchType.fields + .filter((field) => field.kind === 'reference') + .map((field) => field.name), + ); + const facets: Record = {}; + for (const facet of response.facet_counts ?? []) { + const labelled = referenceFacets.has(facet.field_name); + // A range facet echoes the declared range key as the bucket value; look the + // bin's half-open bounds back up by key so the bucket is self-describing. + const field = searchType.fields.find( + (candidate) => candidate.name === facet.field_name, + ); + const rangesByKey = + field?.facetRanges !== undefined + ? new Map(field.facetRanges.map((range) => [range.key, range])) + : undefined; + facets[facet.field_name] = facet.counts.map((bucket) => { + const label = labelled ? 
labels.get(bucket.value) : undefined; + const range = rangesByKey?.get(bucket.value); + return { + value: bucket.value, + count: bucket.count, + ...(label !== undefined ? { label } : {}), + ...(range?.min !== undefined ? { min: range.min } : {}), + ...(range?.max !== undefined ? { max: range.max } : {}), + }; + }); + } + return { hits, total: response.found, facets }; +} + +/** Rebuild one logical document from a flat Typesense document. */ +function reconstructDocument( + flat: Record, + searchType: SearchType, + labels: ReadonlyMap, +): ResultDocument { + const document: Record = {}; + for (const field of outputFields(searchType)) { + if (field.kind === 'boolean') { + // A boolean is always present; an absent value means false. + document[field.name] = flat[field.name] === true; + continue; + } + const value = logicalValue(flat, field, labels); + if (value !== undefined) { + document[field.name] = value; + } + } + return document; +} + +function logicalValue( + flat: Record, + field: SearchField, + labels: ReadonlyMap, +): SearchValue | undefined { + switch (field.kind) { + case 'text': + return localizedValue(flat, field); + case 'reference': + return referenceValue(flat, field, labels); + case 'keyword': { + const value = flat[field.name]; + return Array.isArray(value) || typeof value === 'string' + ? (value as SearchValue) + : undefined; + } + case 'integer': + case 'number': + case 'date': { + const value = flat[field.name]; + return typeof value === 'number' ? value : undefined; + } + case 'boolean': + return flat[field.name] === true; + } +} + +/** Gather the per-locale display fields back into a language map. */ +function localizedValue( + flat: Record, + field: SearchField, +): LocalizedValue | undefined { + const map: Record = {}; + for (const locale of field.locales ?? []) { + const value = flat[`${field.name}_${locale}`]; + if (typeof value === 'string') { + map[locale] = [value]; + } + } + return Object.keys(map).length > 0 ? 
map : undefined; +} + +/** Map stored reference IRIs to labelled references; id-only when no label. */ +function referenceValue( + flat: Record, + field: SearchField, + labels: ReadonlyMap, +): SearchValue | undefined { + const raw = flat[field.name]; + if (raw === undefined) { + return undefined; + } + const iris = Array.isArray(raw) ? (raw as string[]) : [String(raw)]; + const references: Reference[] = iris.map((iri) => { + const label = labels.get(iri); + return label === undefined ? { id: iri } : { id: iri, label }; + }); + return field.array === true ? references : references[0]; +} diff --git a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..e56c6447 --- /dev/null +++ b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,114 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`collection-schema generator stability > derives a stable Typesense collection for a representative schema 1`] = ` +{ + "default_sorting_field": "size", + "fields": [ + { + "index": false, + "name": "title_nl", + "optional": true, + "type": "string", + }, + { + "index": false, + "name": "title_en", + "optional": true, + "type": "string", + }, + { + "locale": "nl", + "name": "title_search_nl", + "optional": true, + "stem": true, + "type": "string", + }, + { + "locale": "en", + "name": "title_search_en", + "optional": true, + "stem": true, + "type": "string", + }, + { + "name": "title_sort_nl", + "optional": true, + "sort": true, + "type": "string", + }, + { + "name": "title_sort_en", + "optional": true, + "sort": true, + "type": "string", + }, + { + "facet": true, + "name": "keyword", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "locale": "nl", + "name": "keyword_search", + "optional": true, + "stem": true, + "type": "string[]", + }, + { + "facet": true, + 
"name": "format", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "creator", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "status", + "optional": false, + "sort": false, + "type": "string", + }, + { + "facet": true, + "name": "size", + "optional": false, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "score", + "optional": true, + "sort": false, + "type": "float", + }, + { + "facet": false, + "name": "created", + "optional": true, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "open", + "optional": true, + "sort": false, + "type": "bool", + }, + ], + "name": "things", + "synonym_sets": [ + "things-synonyms", + ], +} +`; diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts new file mode 100644 index 00000000..49711c1e --- /dev/null +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -0,0 +1,192 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchType } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +const schema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + path: 'https://def.nde.nl/format', + kind: 'keyword', + array: true, + facetable: true, + }, + // Derived fields (no path) still get collection fields — populated at index + // time by derivations, not projected. 
+ { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + { + name: 'size', + kind: 'integer', + facetable: true, + sortable: true, + }, + { name: 'iiif', kind: 'boolean', facetable: true }, + { + name: 'publisher', + path: 'http://purl.org/dc/terms/publisher', + kind: 'reference', + array: true, + facetable: true, + }, + { + name: 'datePosted', + path: 'https://def.nde.nl/datePosted', + kind: 'date', + sortable: true, + }, + { + name: 'score', + kind: 'number', + facetable: true, + }, + ], +}; + +describe('buildCollectionSchema', () => { + const collection = buildCollectionSchema(schema, { + name: 'datasets', + defaultLocale: 'nl', + defaultSortingField: 'statusRank', + synonymSets: ['dataset-synonyms'], + }); + + it('carries the collection name, default sorting field and synonym sets', () => { + expect(collection.name).toBe('datasets'); + expect(collection.default_sorting_field).toBe('statusRank'); + expect(collection.synonym_sets).toEqual(['dataset-synonyms']); + }); + + it('fans a localized text field into display, per-locale stemmed search and sort keys', () => { + expect(collection.fields).toContainEqual({ + name: 'title_nl', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_en', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_nl', + type: 'string', + optional: true, + stem: true, + locale: 'nl', + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_en', + type: 'string', + optional: true, + stem: true, + locale: 'en', + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_nl', + type: 'string', + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_en', + type: 'string', + sort: true, + optional: true, + }); + }); + + it('maps keyword/reference/integer/boolean 
kinds to Typesense value fields', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + // `status` is required → non-optional, like the default sorting field. + expect(collection.fields).toContainEqual({ + name: 'status', + type: 'string', + facet: true, + sort: false, + optional: false, + }); + // statusRank is the default_sorting_field, which Typesense requires to be + // non-optional. + expect(collection.fields).toContainEqual({ + name: 'statusRank', + type: 'int64', + facet: false, + sort: true, + optional: false, + }); + expect(collection.fields).toContainEqual({ + name: 'size', + type: 'int64', + facet: true, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'iiif', + type: 'bool', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'publisher', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'datePosted', + type: 'int64', + facet: false, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'score', + type: 'float', + facet: true, + sort: false, + optional: true, + }); + }); + + it('emits a folded, stemmed search companion for a searchable keyword field', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword_search', + type: 'string[]', + optional: true, + stem: true, + locale: 'nl', + }); + }); +}); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts new file mode 100644 index 00000000..9b93d134 --- /dev/null +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchType } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +/** + 
* A neutral fixture exercising every kind + capability — NOT a real domain. The + * derived Typesense collection is snapshotted purely to pin the **generator**: + * any change to how `buildCollectionSchema` maps the field model (Typesense field + * types, the physical fanout, stem/locale, optional/default-sorting-field) + * surfaces as a snapshot diff before this library is published. + */ +const THING: SearchType = { + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'size', kind: 'integer', facetable: true, sortable: true }, + { name: 'score', kind: 'number', facetable: true }, + { name: 'created', kind: 'date', sortable: true }, + { name: 'open', kind: 'boolean', facetable: true }, + ], +}; + +describe('collection-schema generator stability', () => { + it('derives a stable Typesense collection for a representative schema', () => { + expect( + buildCollectionSchema(THING, { + name: 'things', + defaultSortingField: 'size', + defaultLocale: 'nl', + synonymSets: ['things-synonyms'], + }), + ).toMatchSnapshot(); + }); +}); diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts new file mode 100644 index 00000000..b59b1a47 --- /dev/null +++ b/packages/search-typesense/test/parse-response.test.ts @@ -0,0 +1,465 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; +import type { LocalizedValue, SearchQuery, 
SearchType } from '@lde/search'; +import type { Client } from 'typesense'; +import { + createTypesenseSearchEngine, + fetchLabels, + parseSearchResponse, +} from '../src/search.js'; + +const schema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'size', kind: 'integer', output: true }, + { name: 'datePosted', kind: 'date', output: true }, + { name: 'iiif', kind: 'boolean', facetable: true, output: true }, + // A non-output field is never reconstructed into the logical document. + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + ], +}; + +const labels = new Map([ + ['https://org/1', { nl: ['Het Utrechts Archief'] }], + ['https://org/2', { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }], +]); + +const response = { + found: 2, + hits: [ + { + document: { + id: 'https://d/1', + title_nl: 'Titel', + title_en: 'Title', + keyword: ['kaarten'], + publisher: ['https://org/1'], + size: 1234, + datePosted: 1_700_000_000, + iiif: true, + status: 'valid', + }, + }, + { + document: { + id: 'https://d/2', + title_nl: 'Andere', + keyword: ['atlas', 'kaart'], + publisher: ['https://org/2', 'https://org/3'], + }, + }, + ], + facet_counts: [ + { + field_name: 'keyword', + counts: [ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ], + }, + { + // A reference facet: buckets are keyed by IRI and carry resolved labels. 
+ field_name: 'publisher', + counts: [ + { value: 'https://org/1', count: 2 }, + { value: 'https://org/3', count: 1 }, + ], + }, + ], +}; + +describe('parseSearchResponse', () => { + const result = parseSearchResponse(response, schema, labels); + + it('carries the total and the facet buckets keyed by field name', () => { + expect(result.total).toBe(2); + // A plain facet: buckets carry no label. + expect(result.facets.keyword).toEqual([ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ]); + }); + + it('attaches resolved labels to reference-facet buckets, id-only when unlabelled', () => { + expect(result.facets.publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { value: 'https://org/3', count: 1 }, + ]); + }); + + it('reconstructs localized text into a best-available language map', () => { + expect(result.hits[0].id).toBe('https://d/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Titel'], + en: ['Title'], + }); + // Only the present locale is emitted. 
+ expect(result.hits[1].document.title).toEqual({ nl: ['Andere'] }); + }); + + it('resolves reference IRIs to labelled references, id-only when unlabelled', () => { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + { id: 'https://org/3' }, + ]); + }); + + it('passes keyword arrays and numeric scalars through, and omits absent fields', () => { + expect(result.hits[0].document.keyword).toEqual(['kaarten']); + expect(result.hits[0].document.size).toBe(1234); + expect(result.hits[0].document.datePosted).toBe(1_700_000_000); + expect(result.hits[1].document.size).toBeUndefined(); + }); + + it('defaults an absent boolean to false and never reconstructs non-output fields', () => { + expect(result.hits[0].document.iiif).toBe(true); + expect(result.hits[1].document.iiif).toBe(false); + expect(result.hits[0].document.status).toBeUndefined(); + }); +}); + +describe('parseSearchResponse range facets', () => { + const rangeSchema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'size', + kind: 'integer', + facetable: true, + output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + // Open-ended top bin: no upper bound. 
+ { key: '2', min: 100 }, + ], + }, + ], + }; + + const rangeResponse = { + found: 5, + hits: [], + facet_counts: [ + { + field_name: 'size', + counts: [ + { value: '0', count: 2 }, + { value: '1', count: 1 }, + { value: '2', count: 2 }, + ], + }, + ], + }; + + it('echoes each range bin’s half-open bounds onto its bucket, open ends omitted', () => { + const result = parseSearchResponse(rangeResponse, rangeSchema, new Map()); + expect(result.facets.size).toEqual([ + { value: '0', count: 2, min: 1, max: 10 }, + { value: '1', count: 1, min: 10, max: 100 }, + // The open-ended top bin carries only its lower bound. + { value: '2', count: 2, min: 100 }, + ]); + }); +}); + +describe('createTypesenseSearchEngine label degradation', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // A fake client whose document search succeeds but whose label lookup + // (multi_search) rejects, so the engine must degrade to id-only references. + function fakeClient(): Client { + return { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + }), + }), + multiSearch: { + perform: () => + Promise.reject(new Error('labels collection unavailable')), + }, + } as unknown as Client; + } + + it('degrades to id-only references when the label lookup fails, reporting the cause', async () => { + let capturedError: unknown; + const engine = createTypesenseSearchEngine(fakeClient(), { + collection: 'datasets', + labelsCollection: 'labels', + onLabelError: (error) => { + capturedError = error; + }, + }); + const result = await engine.search(baseQuery, schema); + // The reference is present but unlabelled: the failed lookup degraded + // rather than failing the whole search. 
+ expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + }); +}); + +describe('createTypesenseSearchEngine label cache (labelCacheTtlMs)', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // One labels document, as the export endpoint streams it (JSONL). + const labelsJsonl = JSON.stringify({ + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + }); + + // A fake client whose document search always returns one hit referencing + // `https://org/1`, and whose `labels` collection export is driven by + // `exportImpl`. Counters make the export-call count observable. + function fakeClient(exportImpl: () => Promise<string>) { + let exportCalls = 0; + const client = { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + export: () => { + exportCalls += 1; + return exportImpl(); + }, + }), + }), + }; + return { + client: client as unknown as Client, + exportCalls: () => exportCalls, + }; + } + + afterEach(() => { + vi.useRealTimers(); + }); + + it('loads the collection once for concurrent searches (single-flight)', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + const results = await Promise.all([ + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + ]); + + // One export served all three concurrent searches. 
+ expect(exportCalls()).toBe(1); + for (const result of results) { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + } + }); + + it('serves a later search from cache without a second export', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + await engine.search(baseQuery, schema); + await engine.search(baseQuery, schema); + + expect(exportCalls()).toBe(1); + }); + + it('reloads the collection after the TTL expires', async () => { + vi.useFakeTimers(); + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 1000, + }); + + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Within the TTL: still cached. + vi.advanceTimersByTime(500); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Past the TTL: reload. + vi.advanceTimersByTime(600); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(2); + }); + + it('degrades to id-only references on a load error and retries next time', async () => { + let capturedError: unknown; + let attempt = 0; + const { client, exportCalls } = fakeClient(() => { + attempt += 1; + return attempt === 1 + ? Promise.reject(new Error('labels collection unavailable')) + : Promise.resolve(labelsJsonl); + }); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + onLabelError: (error) => { + capturedError = error; + }, + }); + + // First load fails: id-only reference, error reported, nothing cached. 
+ const failed = await engine.search(baseQuery, schema); + expect(failed.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + expect(exportCalls()).toBe(1); + + // Next search retries the load (the failure was not cached) and resolves. + const recovered = await engine.search(baseQuery, schema); + expect(recovered.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(exportCalls()).toBe(2); + }); +}); + +describe('fetchLabels', () => { + // A fake Typesense client whose multi_search returns the requested ids that + // exist in `docsById`, recording the id-list of each POST so batching is + // observable. (Resolving via multi_search/POST avoids the GET query-string + // limit that a large id-list would otherwise overflow.) + function fakeClient(docsById: Record<string, Record<string, string>>) { + const calls: string[][] = []; + const client = { + multiSearch: { + perform: (request: { searches: { readonly filter_by: string }[] }) => { + const ids = [ + ...request.searches[0].filter_by.matchAll(/`([^`]+)`/g), + ].map((match) => match[1]); + calls.push(ids); + const hits = ids + .filter((id) => docsById[id] !== undefined) + .map((id) => ({ document: { id, ...docsById[id] } })); + return Promise.resolve({ results: [{ found: hits.length, hits }] }); + }, + }, + }; + return { client: client as unknown as Pick<Client, 'multiSearch'>, calls }; + } + + it('resolves labels via multi_search, merging per-locale variants', async () => { + const { client, calls } = fakeClient({ + 'https://org/1': { label: 'KB', label_nl: 'KB' }, + // Only a default label (no locale variant) → untagged (`und`) fallback. 
+ 'https://org/3': { label: 'Untagged' }, + }); + const labels = await fetchLabels(client, 'labels', [ + 'https://org/1', + 'https://org/2', + 'https://org/3', + ]); + expect(labels.get('https://org/1')).toEqual({ nl: ['KB'] }); + expect(labels.get('https://org/3')).toEqual({ und: ['Untagged'] }); + // An IRI absent from the collection yields no entry. + expect(labels.has('https://org/2')).toBe(false); + expect(calls).toHaveLength(1); + }); + + it('batches a large id-list under the per_page cap, one POST per batch', async () => { + const ids = Array.from( + { length: 450 }, + (_unused, index) => `https://example.org/class/${index}`, + ); + const docsById = Object.fromEntries( + ids.map((id) => [id, { label_nl: id }]), + ); + const { client, calls } = fakeClient(docsById); + const labels = await fetchLabels(client, 'labels', ids); + // 450 ids → batches of 200, 200, 50. + expect(calls.map((batch) => batch.length)).toEqual([200, 200, 50]); + expect(labels.size).toBe(450); + }); + + it('makes no request for an empty id-list', async () => { + const { client, calls } = fakeClient({}); + const labels = await fetchLabels(client, 'labels', []); + expect(labels.size).toBe(0); + expect(calls).toHaveLength(0); + }); +}); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts new file mode 100644 index 00000000..9a06d0f8 --- /dev/null +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -0,0 +1,201 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchQuery, SearchType } from '@lde/search'; +import { buildSearchParams } from '../src/query-compiler.js'; + +const schema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 
'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + // Filter-only, non-facet (tokenized) → exact `:=` membership. + { name: 'catalog', kind: 'keyword', array: true, filterable: true }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + facetable: true, + // Half-open `[min, max)` bins; the last is open-ended (no upper bound). + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + { key: '2', min: 100 }, + ], + }, + { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, + ], +}; + +const base: SearchQuery = { + where: [], + orderBy: [], + limit: 20, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('buildSearchParams', () => { + it('browses with a match-all q and the weighted query_by fields', () => { + const params = buildSearchParams(base, schema); + expect(params.q).toBe('*'); + expect(params.query_by).toBe( + 'title_search_nl,title_search_en,keyword_search', + ); + expect(params.per_page).toBe(20); + expect(params.page).toBe(1); + expect(params.filter_by).toBeUndefined(); + expect(params.sort_by).toBeUndefined(); + }); + + it('folds the query text and boosts the active locale in query_by_weights', () => { + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'nl' }, schema), + ).toMatchObject({ q: 'kaart', query_by_weights: '5,4,1' }); + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'en' }, schema) + .query_by_weights, + ).toBe('4,5,1'); + }); + + it('maps offset/limit to numbered pages', () => { + expect( + buildSearchParams({ ...base, offset: 40, limit: 20 }, schema).page, + ).toBe(3); + }); + + it('compiles where clauses, with exact membership for non-facet fields', () => { + const 
params = buildSearchParams( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, + { field: 'keyword', in: ['kaarten', 'atlas'] }, + { field: 'catalog', in: ['urn:cat'] }, + { field: 'format', in: ['text/turtle', 'group:rdf'] }, + { field: 'size', range: { min: 1, max: 10 } }, + { field: 'iiif', is: true }, + ], + }, + schema, + ); + expect(params.filter_by).toBe( + 'status:[`valid`] && ' + + 'keyword:[`kaarten`,`atlas`] && ' + + 'catalog:=[`urn:cat`] && ' + + 'format:[`text/turtle`,`group:rdf`] && ' + + 'size:[1..10] && ' + + 'iiif:=true', + ); + }); + + it('compiles a one-sided range bound', () => { + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { min: 5 } }] }, + schema, + ).filter_by, + ).toBe('size:>=5'); + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { max: 9 } }] }, + schema, + ).filter_by, + ).toBe('size:<=9'); + }); + + it('compiles orderBy: RELEVANCE → _text_match and a localized field → its sort key', () => { + expect( + buildSearchParams( + { + ...base, + orderBy: [ + { field: 'relevance', direction: 'desc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('_text_match:desc,status_rank:asc'); + + expect( + buildSearchParams( + { + ...base, + locale: 'nl', + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('title_sort_nl:asc,status_rank:asc'); + }); + + it('pins page to 1 for a facet-only (limit:0) query instead of dividing by zero', () => { + const params = buildSearchParams({ ...base, limit: 0 }, schema); + expect(params.per_page).toBe(0); + expect(params.page).toBe(1); + }); + + it('requests facets by their logical field name', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'format'] }, schema) + .facet_by, + ).toBe('keyword,format'); + }); + + it('facets a range field into its declared half-open bins, open ends blank', () => { + // 
Typesense range syntax is start-inclusive/end-exclusive, so the declared + // `[min, max)` bounds pass straight through; the open-ended bin leaves the + // upper bound blank. + expect( + buildSearchParams({ ...base, facets: ['size'] }, schema).facet_by, + ).toBe('size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('mixes range and plain facets in one facet_by clause', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'size'] }, schema) + .facet_by, + ).toBe('keyword,size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('omits max_facet_values by default but sets it when configured', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema) + .max_facet_values, + ).toBeUndefined(); + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema, { + maxFacetValues: 250, + }).max_facet_values, + ).toBe(250); + }); +}); diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts new file mode 100644 index 00000000..32f94a59 --- /dev/null +++ b/packages/search-typesense/test/search-engine.test.ts @@ -0,0 +1,226 @@ +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import type { Client } from 'typesense'; +import type { SearchEngine, SearchQuery, SearchType } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; +import { createTypesenseSearchEngine } from '../src/search.js'; +import { TypesenseContainer } from './typesense-container.js'; + +const datasetSchema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + 
output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + ], +}; + +// Flat documents, as the projection would emit them (physical field names). +const documents = [ + { + id: 'd1', + title_nl: 'Kaart van Utrecht', + title_en: 'Map of Utrecht', + title_search_nl: 'kaart van utrecht', + title_search_en: 'map of utrecht', + title_sort_nl: 'kaart van utrecht', + title_sort_en: 'map of utrecht', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd2', + title_nl: 'Atlas der Nederlanden', + title_search_nl: 'atlas der nederlanden', + title_sort_nl: 'atlas der nederlanden', + keyword: ['atlas'], + keyword_search: ['atlas'], + publisher: ['https://org/2'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd3', + title_nl: 'Verouderde kaart', + title_search_nl: 'verouderde kaart', + title_sort_nl: 'verouderde kaart', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'invalid', + statusRank: 3, + }, +]; + +const labelDocuments = [ + { + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + type: 'organization', + }, + { + id: 'https://org/2', + label: 'Rijksmuseum', + label_nl: 'Rijksmuseum', + label_en: 'Rijksmuseum', + type: 'organization', + }, +]; + +const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('createTypesenseSearchEngine (integration)', () => { + const container = new TypesenseContainer(); + let client: Client; + let engine: SearchEngine; + + beforeAll(async () => { + client = await container.start(); + // Typesense accepts the generated schema (stemming, locales, int64, …). 
+ await client.collections().create( + buildCollectionSchema(datasetSchema, { + name: 'datasets', + defaultSortingField: 'statusRank', + defaultLocale: 'nl', + }), + ); + await client.collections().create({ + name: 'labels', + fields: [ + { name: 'label', type: 'string' }, + { name: 'label_nl', type: 'string', optional: true, index: false }, + { name: 'label_en', type: 'string', optional: true, index: false }, + { name: 'type', type: 'string', facet: true }, + ], + }); + await client + .collections('datasets') + .documents() + .import(documents, { action: 'create' }); + await client + .collections('labels') + .documents() + .import(labelDocuments, { action: 'create' }); + + engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + }); + }, 120_000); + + afterAll(async () => { + await container.stop(); + }); + + it('filters by status, sorts by the localized title key, and resolves reference labels', async () => { + const result = await engine.search( + { + ...baseQuery, + where: [{ field: 'status', in: ['valid'] }], + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'statusRank', direction: 'asc' }, + ], + }, + datasetSchema, + ); + + // d3 is invalid → filtered out; remaining two sorted by folded title. 
+ expect(result.total).toBe(2); + expect(result.hits.map((hit) => hit.id)).toEqual(['d2', 'd1']); + expect(result.hits[0].document.title).toEqual({ + nl: ['Atlas der Nederlanden'], + }); + expect(result.hits[0].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + }); + + it('ranks a full-text query through the weighted query_by fields', async () => { + const result = await engine.search( + { + ...baseQuery, + text: 'Utrecht', + orderBy: [{ field: 'relevance', direction: 'desc' }], + }, + datasetSchema, + ); + + expect(result.hits[0].id).toBe('d1'); + expect(result.hits.map((hit) => hit.id)).not.toContain('d2'); + }); + + it('returns facet buckets with counts, labelling reference facets', async () => { + const result = await engine.search( + { ...baseQuery, facets: ['keyword', 'publisher'] }, + datasetSchema, + ); + + // Plain facet: value + count, no label. + const keyword = [...(result.facets.keyword ?? [])].sort( + (a, b) => b.count - a.count, + ); + expect(keyword).toEqual([ + { value: 'kaarten', count: 2 }, + { value: 'atlas', count: 1 }, + ]); + + // Reference facet: IRI-keyed buckets carry the resolved data label. + const publisher = [...(result.facets.publisher ?? 
[])].sort( + (a, b) => b.count - a.count, + ); + expect(publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { + value: 'https://org/2', + count: 1, + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + }); +}); diff --git a/packages/search-typesense/tsconfig.lib.json b/packages/search-typesense/tsconfig.lib.json index e7c2ce37..52ca4bb7 100644 --- a/packages/search-typesense/tsconfig.lib.json +++ b/packages/search-typesense/tsconfig.lib.json @@ -8,7 +8,10 @@ "types": ["node"] }, "include": ["src/**/*.ts"], - "references": [], + "references": [ + { "path": "../search/tsconfig.lib.json" }, + { "path": "../text-normalization/tsconfig.lib.json" } + ], "exclude": [ "vite.config.ts", "vite.config.mts", diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index a09c9579..a6245e7b 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,10 +16,10 @@ export default mergeConfig( // rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. thresholds: { - functions: 87.5, - lines: 84.7, - branches: 66.66, - statements: 84.88, + functions: 97.14, + lines: 93.31, + branches: 83.75, + statements: 93.37, }, }, }, diff --git a/packages/search/README.md b/packages/search/README.md index 5672881e..ca84cd21 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -1,170 +1,175 @@ # @lde/search -Engine-agnostic search projection for RDF-backed pipelines. **`projectGraph`** -streams the result of a SPARQL `CONSTRUCT` into flat search documents, with no -engine and no vocabulary baked in. Internally it does two things per subject of -a root type: frame its one-hop subgraph into a JSON-LD IR node, then project -that node into a flat document from a **declarative field spec**. +The **engine- and domain-agnostic core** for RDF-backed search. 
It bakes in no +search engine, no API protocol, and no domain vocabulary: you supply a +declarative `SearchSchema`, and engine adapters and API surfaces sit on the ports +defined here. The library never names your domain — the same core drives a +`Dataset`, `Person`, or `CreativeWork` search. + +It provides four things: + +- **the unified field model** — `SearchField` / `SearchType` / `SearchSchema`: + one declaration per field that drives all four consumers below, so they + cannot drift; +- **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter + semantics, the shared compiler target every API surface parses into; +- **the engine port** — `SearchEngine` and the logical result types + (`SearchResult` / `SearchHit` / `ResultDocument` / `Reference` / …); +- **a streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat + search documents. -An engine adapter (e.g. [`@lde/search-typesense`](../search-typesense)) then -writes those documents to a search backend. +``` +SearchSchema ─┬─► projection (projectGraph → flat documents) [here] + ├─► engine adapter (collection schema + query compiler) e.g. @lde/search-typesense + ├─► query semantics (SearchQuery, filter/sort/facet) [here] + └─► API surface (GraphQL / REST) e.g. @lde/search-api-graphql +``` -```ts -import { projectGraph, type Projection } from '@lde/search'; +One field, four consumers — that is why the model is unified: a field’s `kind` +plus capability flags (`searchable` / `filterable` / `facetable` / `sortable` / +`output`) describe projection, the engine collection schema, the query semantics, +and the API output in a single place. 
-const projection: Projection = { - /* type + field spec — see below */ -}; +## Terminology -for await (const document of projectGraph(quads, [projection])) { - // one flat search document per matching subject, streamed -} -``` +The model has three levels, mirroring both SHACL (the source vocabulary) and +GraphQL (one of the surfaces): -`projectGraph` is fully streaming: subjects are grouped and framed one at a time -and documents are yielded as they are produced, so beyond a subject index memory -stays flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate -triples are collapsed first, because some SPARQL engines (e.g. QLever) do not -deduplicate `CONSTRUCT` output. The IR carries no `@context`, so a `derivation` -reading it sees full predicate IRIs with language tags preserved. +| Term | What it is | SHACL | GraphQL | +| -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | +| `SearchField` | One queryable field: a `kind`, the IR `path` it projects from, and the capability flags it opts into | property shape | field | +| `SearchType` | One root type’s complete declaration: its `type` IRI plus its fields and derivations | NodeShape | object type | +| `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | -## Projection +`projectGraph` consumes a `SearchSchema` (it projects every type in one pass); +the engine port and the GraphQL surface operate on one `SearchType` at a time. -The mapping is data, not code. Each field declares the IR `path` to read and a -`kind`; the conventions (per-locale split, diacritic folding via -[`@lde/text-normalization`](../text-normalization), facet arrays, numeric -coercion) are applied for you. Computed fields are `derivations` — hooks that -read the node and set fields the kinds can't. 
+## Field model -```ts -import { projectGraph, irisOf, type Projection } from '@lde/search'; +The mapping is data, not code. Each field declares its `kind`, the IR `path` to +read (omit it for a **derived** field, populated by a `derivation`), and the +capabilities it opts into. The physical field names a declaration fans out to +(per-locale search/sort keys) come from +`physicalFields`, the single convention that projection, the collection schema and the +query compiler all share. -const projection: Projection = { +```ts +import { + projectGraph, + irisOf, + searchSchema, + type SearchType, +} from '@lde/search'; + +const DATASET = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ - // → title_nl, title_en, title_search_nl, title_search_en, title_sort_nl, title_sort_en + // → title_nl, title_en, title_search_nl/_en, title_sort_nl/_en { name: 'title', path: 'http://purl.org/dc/terms/title', - kind: { - type: 'langText', - locales: ['nl', 'en'], - display: true, - search: true, - sort: true, - }, + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, }, - // → publisher (IRI facet) + // → publisher (IRI facet, resolved to a labelled reference at the surface) { name: 'publisher', path: 'http://purl.org/dc/terms/publisher', - kind: { type: 'facet', iri: true }, + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, }, // → size (int) - { name: 'size', path: 'urn:dr:size', kind: { type: 'number' } }, + { name: 'size', path: 'urn:dr:size', kind: 'integer', sortable: true }, + // derived field (no path): populated by the derivation below + { name: 'classCount', kind: 'integer', sortable: true }, ], derivations: [ (document, node) => { - document.class_count = irisOf(node, 'urn:dr:class').length; + document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -}; +} as const satisfies SearchType; -for await (const document of projectGraph(quads, 
[projection])) { - // … +for await (const document of projectGraph(quads, searchSchema(DATASET))) { + // one flat search document per matching subject, streamed } ``` -**Kinds** +Capturing the type with `as const satisfies SearchType` keeps the field +literals, so the API surface can derive typed facet/output keys from it (see +`@lde/search-api-graphql`). -| kind | emits | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `langText` | per locale (see below), each opt-in: `_${locale}` display with `display`, `_search_${locale}` folded with `search`, `_sort_${locale}` folded with `sort` | -| `facet` | the field as a deduped array; `iri` reads `@id`; `search` adds a folded `_search`; `transform` rewrites values | -| `number` | a numeric scalar; `date` parses an ISO date-time to unix seconds | +**Kinds** (`FieldKind`): `text`, `keyword`, `integer`, `number`, `boolean`, +`date`, `reference`. The Typesense/engine vocabulary and the GraphQL types are +_derived_ from the kind by the adapter and the surface — never declared here. 
+ +| kind | `where` | facet | sort | output | +| -------------------- | -------------------- | ----- | ---------------- | ------------------------------- | +| `text` (`localized`) | – (feeds free text) | – | yes (per-locale) | best-first language list | +| `keyword` | `in` (membership) | yes | – | string / `string[]` | +| `reference` | `in` (membership) | yes | – | labelled reference (id + label) | +| `integer` / `number` | `range { min, max }` | yes | yes | number | +| `date` | `range` (inclusive) | yes | yes | ISO 8601 string (surface) | +| `boolean` | `is` | yes | – | boolean (absent = false) | + +## Projection + +`projectGraph` is fully streaming: subjects are grouped and framed one at a time +and documents are yielded as produced, so beyond a subject index memory stays +flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate +triples are collapsed first, because some SPARQL engines (e.g. QLever) do not +deduplicate `CONSTRUCT` output. The IR carries no `@context`, so a `derivation` +reading it sees full predicate IRIs with language tags preserved. 
## Locales -`locales` is the **single** list of languages a `langText` field projects; -`display`, `search` and `sort` are independent opt-in families that each fan out +`locales` is the **single** list of languages a localized `text` field projects; +`output`, `searchable` and `sortable` are independent opt-ins that each fan out over it (so a field emits exactly what it opts into): -- `display` → `title_nl`/`title_en` (accents preserved); -- `search` → `title_search_nl`/`title_search_en` (folded; one field per locale - lets a query `query_by` them and rank the user’s language higher via - `query_by_weights`, and lets a language that needs a dedicated tokenizer set - its own `locale` in the schema); -- `sort` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI +- `output` → `title_nl`/`title_en` (accents preserved); +- `searchable` → `title_search_nl`/`title_search_en` (folded; one field per locale + lets a query `query_by` them and rank the user’s language higher, and lets a + language that needs a dedicated tokenizer set its own stemming `locale` in the + engine schema); +- `sortable` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI sorts on the active language). -A field with `search` but no `display` is **search-only** — folded and stemmed -for retrieval but never rendered (e.g. a `publisher` searched here but shown via -a separate label). +A field with `searchable` but no `output` is **search-only** — folded and stemmed +for retrieval but never rendered (e.g. a creator searched here but shown via a +separate label). **Only listed locales are indexed**; a literal whose language tag +is not in `locales` (or has no tag) is not projected at all. Per-locale fields are +**omitted, never empty**, when a document lacks that language, so declare them +optional in the engine schema and sort with `missing_values: last`. Folding the search fields is what lets diacritic-insensitive matching and stemming coexist. 
A search engine on its **default** locale typically folds case -and diacritics for you (Typesense v30, verified, even folds ø/æ/ß) — so there the -folding here is belt-and-suspenders. But enabling a language’s **stemming** -requires setting that language’s `locale` (e.g. `locale: 'nl'` + `stem: true` so -`huizen` matches `huis`), and a non-default locale switches the engine to ICU -tokenization, which **preserves** diacritics. At that point the engine no longer -folds them, and `fold()` is what keeps matching diacritic-insensitive. Stemming -is a per-field engine-schema choice (the consumer’s), and being rules-based it -can mangle proper nouns and place names — e.g. the Dutch stemmer reduces the city -`Bergen` to `berg`, colliding it with “mountain”. - -Recommended split: enable stemming on the **free-text** search fields -(`*_search_${locale}`, descriptions, keywords) where morphological recall helps -(`verhaal` ↔ `verhalen`), and keep **place names and other proper-noun facets on -a separate, unstemmed field** (facets are exact-match anyway). That captures the -recall without the `Bergen`/`berg` collision in the facet. A `stem_dictionary` -can pin specific names if you need stemmed free-text without given collisions. - -**Only listed locales are indexed.** A literal whose language tag is not in -`locales` is not projected at all — no display, no search, no sort field — so it -is invisible to the index. To index a language, add it to `locales`. - -Per-locale fields are **omitted, never empty**, when a document lacks that -language, so declare them `optional: true` in the engine schema. At query time, -sort with `missing_values: last` to push documents lacking the active locale to -the end, and `query_by` all the per-locale search fields (weighting the user’s -locale higher) to keep cross-language recall. - -A literal with no `@language` tag matches no locale, so it is not projected. Tag -your source literals (or pre-process them) for the languages you index. 
+and diacritics for you; enabling a language’s **stemming** switches it to ICU +tokenization, which **preserves** diacritics — at which point `fold()` (from +[`@lde/text-normalization`](../text-normalization)) is what keeps matching +diacritic-insensitive. Stemming is rules-based and can mangle proper nouns (the +Dutch stemmer reduces the city `Bergen` to `berg`), so enable it on free-text +fields and keep proper-noun facets on a separate, unstemmed field. ## Querying The search fields are stored already case- and diacritic-folded, so **the query -must be folded the same way** with the same `fold()` from -[`@lde/text-normalization`](../text-normalization) before it reaches the engine. -Otherwise index and query are normalized differently and matches silently miss -(the user sees no results, with no error). An engine on its default locale would -fold a raw query for you, but one set to a stemming locale (which preserves -diacritics) or a non-folding backend will not — so always fold, and matching -stays correct on any engine. - -```ts -import { fold } from '@lde/text-normalization'; - -await client - .collections(collection) - .documents() - .search({ - q: fold(userQuery), - query_by: 'title_search_nl,title_search_en', - query_by_weights: '2,1', // rank the user’s locale higher - }); -``` - -This contract holds for **any** consumer, including a search API built on top of -this package: index-time and query-time folding must use the same `fold()`, or -non-decomposing terms silently miss. - -## Why a spec - -The field spec's vocabulary mirrors SHACL on purpose: `path` is `sh:path`, and -the kind is derivable from `sh:datatype` / `sh:nodeKind` / `sh:maxCount` plus -search annotations. So the same projection engine that runs a hand-written spec -today will run a **SHACL-generated** spec tomorrow — the engine and the IR stay; -only spec-authoring gets automated. Nothing is thrown away. 
+must be folded the same way** with the same `fold()` before it reaches the engine, +or index and query normalize differently and matches silently miss. This contract +holds for **any** consumer, including an API built on this package — which is why +engine adapters and surfaces compile through the shared `SearchQuery` IR and the +`physicalFields` convention rather than re-deriving field names. + +## Why a declarative model + +The vocabulary mirrors SHACL on purpose: `path` is `sh:path`, `array` is +`sh:maxCount`, `required` is `sh:minCount`, `localized` is `sh:languageIn`, `ref` +is `sh:class`/`sh:node`. So the same core that runs a hand-written `SearchSchema` +today will run a **SHACL-generated** one tomorrow — the model, the ports and the +IR stay; only schema-authoring gets automated. diff --git a/packages/search/package.json b/packages/search/package.json index 61657f95..6e7414c9 100644 --- a/packages/search/package.json +++ b/packages/search/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search", "version": "0.1.2", - "description": "Engine-agnostic search projection for RDF-backed pipelines: frame CONSTRUCT quads into a JSON-LD IR, then project that IR into flat search documents from a declarative field spec (the artifact a SHACL generator would emit)", + "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchType/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. 
Bakes in no engine, protocol, or domain.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search" diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts new file mode 100644 index 00000000..1a47bd6b --- /dev/null +++ b/packages/search/src/engine.ts @@ -0,0 +1,149 @@ +import type { SearchQuery } from './query.js'; +import type { SearchType } from './schema.js'; + +/** + * The engine port — the boundary a concrete engine adapter (e.g. + * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. The adapter + * owns every engine-specific detail (companion-field expansion, full-text field + * selection and weights, filter compilation, sorting, result folding, faceting) + * and returns only logical + * documents, so a deployment can swap engines without any consumer noticing. + * Nothing engine-specific and nothing RDF-specific leaks past this port. + * + * `FacetField` keys the returned facet map; it defaults to `string` so an engine + * stays ergonomic, and a deployment can narrow it to its own facet-field union + * (see {@link FacetFieldsOf}) for typo-safe facet access. + */ +export interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, +> { + search( + query: SearchQuery, + searchType: SearchType, + ): Promise<SearchResult<FacetField, OutputField>>; +} + +/** What an engine returns: logical hits, a total, and the requested facets. */ +export interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit<OutputField>[]; + readonly total: number; + readonly facets: FacetMap<FacetField>; +} + +/** + * Facet buckets keyed by facet field name. `Partial` because a result carries + * buckets only for the fields the query asked for, not every facetable field. + */ +export type FacetMap<FacetField extends string = string> = Readonly< + Partial<Record<FacetField, readonly FacetBucket[]>> +>; + +/** + * The facet-field-name union of a search type — the keys a {@link SearchResult}’s + * `facets` can hold.
Requires the type be captured as a literal + * (`as const satisfies SearchType`), so the `facetable: true` flags survive as + * literals; a plain `: SearchType` annotation widens them and yields `never`. + */ +export type FacetFieldsOf<Type extends SearchType> = Extract< + Type['fields'][number], + { readonly facetable: true } +>['name']; + +/** + * The output-field-name union of a search type — the keys a {@link ResultDocument} + * can hold. Like {@link FacetFieldsOf}, requires the type captured as a literal + * (`as const satisfies SearchType`). + */ +export type OutputFieldsOf<Type extends SearchType> = Extract< + Type['fields'][number], + { readonly output: true } +>['name']; + +/** A {@link SearchEngine} narrowed to one search type: facet keys and document + * keys fixed to that type’s facetable / output field names. The type must be + * captured as `as const satisfies SearchType`. */ +export type EngineFor<Type extends SearchType> = SearchEngine< + FacetFieldsOf<Type>, + OutputFieldsOf<Type> +>; + +/** A {@link SearchResult} narrowed to one search type (see {@link EngineFor}). */ +export type ResultFor<Type extends SearchType> = SearchResult< + FacetFieldsOf<Type>, + OutputFieldsOf<Type> +>; + +/** + * One result row. `id` (the stable document key, an IRI) is kept *out* of + * {@link ResultDocument}: it is always present and is the hit’s identity, a + * different contract from the optional, typed logical field values — and it maps + * straight onto the GraphQL output’s guaranteed `id: String!`. The document + * holds only the selectable fields. + */ +export interface SearchHit<OutputField extends string = string> { + readonly id: string; + readonly document: ResultDocument<OutputField>; +} + +/** + * The logical result document at the query seam — engine- and RDF-neutral. + * Distinct from the flat, fanned-out projection `SearchDocument` that lives + * index-side: this carries logical fields with language maps and references, + * ready for a surface to shape. Keyed by output field name; `Partial` because a + * document omits absent optional fields.
`OutputField` defaults to `string`; a + * deployment narrows it via {@link OutputFieldsOf} for typo-safe field access. + */ +export type ResultDocument<OutputField extends string = string> = Readonly< + Partial<Record<OutputField, SearchValue>> +>; + +/** A logical field value. */ +export type SearchValue = + | string + | number + | boolean + | readonly string[] + | LocalizedValue + | Reference + | readonly Reference[]; + +/** + * A JSON-LD-style language map (`@container: @language`, `@set` arrays); the key + * `und` carries untagged (`@none`) values. The surface flattens it to a + * best-first `Accept-Language`-ordered list. + */ +export type LocalizedValue = Readonly<Record<string, readonly string[]>>; + +/** + * The generic internal carrier for a referenced entity. The GraphQL surface maps + * it to a named per-shape type (e.g. `Organization`, `Term`) with `label` + * exposed as `name`. + */ +export interface Reference { + readonly id: string; + readonly label?: LocalizedValue; +} + +/** + * One facet bucket: a value and how many documents carry it. `label` is the + * engine-resolved canonical **data** label, present only for reference facets + * (IRI-keyed); it is absent for facets whose value is a token or free string + * whose display the consumer owns (its own i18n, or the value itself). + */ +export interface FacetBucket { + readonly value: string; + readonly count: number; + readonly label?: LocalizedValue; + /** + * For a range-facet bucket: its half-open bounds (`min` inclusive, `max` + * exclusive), echoing the declared {@link FacetRange} so the bucket is + * self-describing and a consumer never hardcodes the bin formula. Both absent + * for a value facet; either absent for an open-ended bin.
+ */ + readonly min?: number; + readonly max?: number; +} diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 10c2b32f..5f86c025 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -1,13 +1,50 @@ +// Projection: RDF CONSTRUCT quads → flat search documents, driven by the unified +// SearchField/SearchType model below (one declaration; the fanout names come +// from `physicalFields`). export { projectGraph, irisOf, literalsOf, firstLiteralOf } from './project.js'; +export type { SearchDocument } from './project.js'; + +// Unified field model: one declaration drives projection, engine collection +// schema, query semantics and the GraphQL surface. Plus the field selectors and +// the physical field-name convention they all share. +export { + searchSchema, + physicalFields, + searchableFields, + facetableFields, + filterableFields, + sortableFields, + outputFields, +} from './schema.js'; export type { - SearchDocument, - Projection, - FieldSpec, FieldKind, - LangTextKind, - FacetKind, - NumberKind, - DateKind, + SearchField, + SearchType, + SearchSchema, Derivation, -} from './project.js'; + PhysicalFields, + FacetRange, +} from './schema.js'; + +// Engine- and protocol-neutral query IR + filter semantics. +export { filterOperatorFor, filterOperator, acceptsFilter } from './query.js'; +export type { SearchQuery, Filter, Sort, FilterOperator } from './query.js'; + +// Engine port + the logical result document returned across it. 
+export type { + SearchEngine, + SearchResult, + SearchHit, + ResultDocument, + SearchValue, + LocalizedValue, + Reference, + FacetBucket, + FacetMap, + FacetFieldsOf, + OutputFieldsOf, + EngineFor, + ResultFor, +} from './engine.js'; + export type { FramedNode } from './frame-by-type.js'; diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index c181978f..5aede395 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -1,135 +1,56 @@ import type { Quad } from '@rdfjs/types'; import { fold } from '@lde/text-normalization'; import { frameByType, type FramedNode } from './frame-by-type.js'; +import { + physicalFields, + type SearchField, + type SearchSchema, + type SearchType, +} from './schema.js'; /** A flat search document. `id` is the engine document key. */ export type SearchDocument = { id: string } & Record; -/** - * How one framed-IR property projects into search fields. The vocabulary mirrors - * SHACL so a generator can later emit it from shapes + search annotations: - * `path` is `sh:path`, and the kind is derivable from `sh:datatype`/`sh:nodeKind` - * /`sh:maxCount` plus the search annotations. - */ -export type FieldKind = LangTextKind | FacetKind | NumberKind | DateKind; - -/** - * Language-tagged text, projected per locale. `locales` is the single source of - * truth for which languages this field emits; `display`, `search` and `sort` are - * three independent opt-in families that each fan out over it: - * - `display` → `${name}_${locale}` display label, accents preserved; - * - `search` → `${name}_search_${locale}` folded match field (one per locale so - * the engine can tokenize/stem each language and the query can rank the user’s - * locale higher); - * - `sort` → `${name}_sort_${locale}` folded sort key (one per locale so a - * locale-switching UI sorts on the active language). - * - * All three default off — a field emits exactly the families it opts into (e.g. 
- * `search` alone is a search-only field, shown via a separate label). Only listed - * locales are projected: a value whose language tag is not in `locales` (and is - * not mapped in by `untaggedLanguage`) is not indexed at all. - */ -export interface LangTextKind { - readonly type: 'langText'; - /** The languages to project; drives whichever of the families are enabled. */ - readonly locales: readonly string[]; - /** Emit the per-locale display labels `${name}_${locale}` (accents preserved). */ - readonly display?: boolean; - /** Emit a folded `${name}_search_${locale}` per locale (matchable). */ - readonly search?: boolean; - /** Emit a folded `${name}_sort_${locale}` per locale (sortable). */ - readonly sort?: boolean; -} - -/** A faceted multi-value field, optionally also folded for search. */ -export interface FacetKind { - readonly type: 'facet'; - /** Read IRI references (`@id`) rather than literal values. */ - readonly iri?: boolean; - /** Also emit a folded `${name}_search` array. */ - readonly search?: boolean; - /** Transform each value before faceting (e.g. strip a media-type prefix). */ - readonly transform?: (value: string) => string; -} - -/** A numeric scalar. */ -export interface NumberKind { - readonly type: 'number'; -} - -/** An ISO date-time, parsed into Unix seconds. */ -export interface DateKind { - readonly type: 'date'; -} - -/** - * One field of a projection: an output `name`, the framed-IR predicate `path` to - * read (the SHACL `sh:path`), and the kind-specific config discriminated by - * `type`. - */ -export type FieldSpec = { - /** Output field base name; per-kind suffixes are appended. */ - readonly name: string; - /** Framed-IR predicate IRI to read (the SHACL `sh:path`). */ - readonly path: string; -} & FieldKind; - -/** A computed field that is not a direct projection of a single path - * (e.g. a status rank, or a group derived from a code table). 
*/ -export type Derivation = (document: SearchDocument, node: FramedNode) => void; - -/** - * One root type’s complete projection — the runtime form of a single SHACL - * NodeShape: `type` is its `sh:targetClass` (and the framed node’s `@type`), - * `fields` are its property shapes, and `derivations` are its `sh:rule`-shaped - * computed fields. A generator emits one of these per NodeShape. - */ -export interface Projection { - readonly type: string; - readonly fields: readonly FieldSpec[]; - readonly derivations?: readonly Derivation[]; -} - /** * Project one framed JSON-LD node into a flat search document: apply each field - * spec, then run the derivations (which may read fields the specs already set). + * of the type, then run the derivations (which may read fields the field specs + * already set). The physical field names a field fans out to come from + * {@link physicalFields}, the single source shared with the engine collection + * schema and the query compiler. */ export function projectDocument( node: FramedNode, - projection: Projection, + searchType: SearchType, ): SearchDocument { const id = node['@id']; if (typeof id !== 'string') { throw new Error( - `Cannot project a ${projection.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, + `Cannot project a ${searchType.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, ); } const document: SearchDocument = { id }; - for (const field of projection.fields) { + for (const field of searchType.fields) { applyField(document, node, field); } - for (const derive of projection.derivations ?? []) { + for (const derive of searchType.derivations ?? []) { derive(document, node); } return document; } /** - * Frame `quads` for every projection’s root type and project each node with its - * type’s projection — the multi-shape pipeline. 
Streams one document at a time - * so memory stays flat. The IR maps to a projection by type, so adding a shape - * is adding a `Projection` (no engine change). + * Frame `quads` for every root type in the schema and project each node with its + * type’s declaration — the multi-shape pipeline. Streams one document at a time + * so memory stays flat. The IR maps to a declaration by type, so adding a shape + * is adding a `SearchType` to the schema (no engine change). */ export async function* projectGraph( quads: readonly Quad[], - projections: readonly Projection[], + schema: SearchSchema, ): AsyncIterable<SearchDocument> { - const byType = new Map( - projections.map((projection) => [projection.type, projection]), - ); - for (const projection of byType.values()) { - for await (const node of frameByType(quads, projection.type)) { - yield projectDocument(node, projection); + for (const searchType of schema.values()) { + for await (const node of frameByType(quads, searchType.type)) { + yield projectDocument(node, searchType); } } } @@ -137,77 +58,102 @@ export async function* projectGraph( function applyField( document: SearchDocument, node: FramedNode, - field: FieldSpec, + field: SearchField, ): void { - switch (field.type) { - case 'langText': - return applyLangText(document, langValuesOf(node, field.path), field); - case 'facet': - return applyFacet(document, node, field); + const path = field.path; + if (path === undefined) { + // A derived field — populated by a derivation, not projected from a path.
+ return; + } + switch (field.kind) { + case 'text': + return applyLocalizedText(document, langValuesOf(node, path), field); + case 'keyword': + return applyFacet(document, literalsOf(node, path), field); + case 'reference': + return applyFacet(document, irisOf(node, path), field); + case 'integer': + return setNumber( + document, + field.name, + toInteger(firstLiteralOf(node, path)), + ); case 'number': return setNumber( document, field.name, - toInteger(firstLiteralOf(node, field.path)), + toNumber(firstLiteralOf(node, path)), ); case 'date': return setNumber( document, field.name, - isoToUnix(firstLiteralOf(node, field.path)), + isoToUnix(firstLiteralOf(node, path)), ); } + // `boolean` is not projected from a path in current search types — booleans are + // derivation-populated (e.g. the compatibility checkmarks). } -function applyLangText( +/** + * Project a language-tagged text field per locale. Display shows one label + * (accents preserved) when the field is `output`; sort keys off that same + * primary value (folded) when `sortable`; search folds every value of the locale + * when `searchable`, so all are matchable. Absent locales emit nothing. + */ +function applyLocalizedText( document: SearchDocument, values: readonly LangValue[], - { name, locales, display, search, sort }: Extract, + field: SearchField, ): void { + const locales = field.locales ??
[]; if (locales.length === 0) { throw new Error( - `langText field “${name}” must declare at least one locale; nothing would be projected otherwise.`, + `Localized text field “${field.name}” must declare at least one locale; nothing would be projected otherwise.`, ); } - for (const locale of locales) { + const names = physicalFields(field); + locales.forEach((locale, index) => { const localeValues = values .filter((value) => value.lang === locale) .map((value) => value.value); if (localeValues.length === 0) { - continue; + return; } - // Display shows one label (accents preserved); sort keys off that same - // primary value (folded); search folds every value of the locale so all - // are matchable. Absent locales emit nothing (the field stays optional). const [primary] = localeValues; - if (display) { - setString(document, `${name}_${locale}`, primary); + if (field.output) { + setString(document, names.display[index], primary); } - if (search) { + if (field.searchable) { setString( document, - `${name}_search_${locale}`, + names.search[index], fold(localeValues.join(' ')).trim(), ); } - if (sort) { - setString(document, `${name}_sort_${locale}`, fold(primary)); + if (field.sortable) { + setString(document, names.sort[index], fold(primary)); } - } + }); } +/** + * Project a faceted multi-value field: dedupe (after the optional transform), + * write the value field, and — when `searchable` — a folded `${name}_search` + * array. `keyword` reads literals; `reference` reads IRIs (the caller passes the + * already-read raw values). + */ function applyFacet( document: SearchDocument, - node: FramedNode, - { name, path, iri, search, transform }: Extract, + raw: readonly string[], + field: SearchField, ): void { - const raw = iri ? irisOf(node, path) : literalsOf(node, path); - const values = dedupe(transform ? raw.map(transform) : raw); - setArray(document, name, values); - if (search) { + const values = dedupe(field.transform ? 
raw.map(field.transform) : raw); + setArray(document, field.name, values); + if (field.searchable) { setArray( document, - `${name}_search`, + physicalFields(field).search[0], dedupe(values.map((value) => fold(value))), ); } @@ -296,6 +242,10 @@ function toInteger(literal: string | undefined): number | undefined { return literal === undefined ? undefined : Math.trunc(Number(literal)); } +function toNumber(literal: string | undefined): number | undefined { + return literal === undefined ? undefined : Number(literal); +} + function isoToUnix(iso: string | undefined): number | undefined { if (iso === undefined) { return undefined; diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts new file mode 100644 index 00000000..d009ea75 --- /dev/null +++ b/packages/search/src/query.ts @@ -0,0 +1,95 @@ +import type { FieldKind, SearchField } from './schema.js'; + +/** + * The engine- and protocol-neutral query IR. Every API surface parses its input + * into this; the engine adapter consumes it. It is the shared compiler target + * that keeps the GraphQL surface, a later REST surface and the adapter from + * drifting. + */ +export interface SearchQuery { + /** Free-text query; `undefined`/`''` means browse (no text ranking). */ + readonly text?: string; + /** AND across fields. */ + readonly where: readonly Filter[]; + /** Primary public sort plus any server tie-breaks, in precedence order. */ + readonly orderBy: readonly Sort[]; + /** Numbered pagination. */ + readonly limit: number; + readonly offset: number; + /** Logical field names to return facet buckets for. */ + readonly facets: readonly string[]; + /** Selects the per-locale fields to query/sort on (from `Accept-Language`). */ + readonly locale: string; +} + +/** + * One `where` clause. The operator is fixed by the target field’s {@link FieldKind} + * ({@link filterOperatorFor}): keyword/reference use `in` (OR within the field), + * the numeric/date kinds use an inclusive `range`, boolean uses `is`. 
Bounds are + * inclusive only — no `gt`/`gte`/`lt`/`lte`. + */ +export type Filter = + | { readonly field: string; readonly in: readonly string[] } + | { + readonly field: string; + readonly range: { + readonly min?: number | string; + readonly max?: number | string; + }; + } + | { readonly field: string; readonly is: boolean }; + +/** A single sort dimension. */ +export interface Sort { + readonly field: string; + readonly direction: 'asc' | 'desc'; +} + +/** A `where` operator. A kind that is not filterable through `where` has none — + * {@link filterOperatorFor} returns `undefined` for it (`text` feeds the + * free-text `query` instead). */ +export type FilterOperator = 'in' | 'range' | 'is'; + +const OPERATOR_BY_KIND: Readonly< + Record<FieldKind, FilterOperator | undefined> +> = { + text: undefined, + keyword: 'in', + reference: 'in', + integer: 'range', + number: 'range', + date: 'range', + boolean: 'is', +}; + +/** + * The `where` operator a field of this kind accepts (per the ADR filter-semantics + * table), or `undefined` for `text` — which feeds the free-text `query` rather + * than `where`. Drives both the surface’s `where` input type and the adapter’s + * filter compiler from one rule. + */ +export function filterOperatorFor(kind: FieldKind): FilterOperator | undefined { + return OPERATOR_BY_KIND[kind]; +} + +/** The operator a concrete {@link Filter} carries, from its shape. */ +export function filterOperator(filter: Filter): FilterOperator { + if ('in' in filter) { + return 'in'; + } + if ('range' in filter) { + return 'range'; + } + return 'is'; +} + +/** + * Whether `field` can be filtered by `filter`: the field must be `filterable` + * and the filter’s shape must be the operator its kind accepts. Surfaces use it + * to reject malformed `where` input before it reaches the adapter.
+ */ +export function acceptsFilter(field: SearchField, filter: Filter): boolean { + return ( + field.filterable === true && + filterOperator(filter) === filterOperatorFor(field.kind) + ); +} diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts new file mode 100644 index 00000000..7a687925 --- /dev/null +++ b/packages/search/src/schema.ts @@ -0,0 +1,215 @@ +import type { FramedNode } from './frame-by-type.js'; +import type { SearchDocument } from './project.js'; + +/** + * The engine-neutral kind of a queryable field — the runtime form of one SHACL + * property shape’s datatype/nodeKind. It drives every downstream behavior: + * which physical fields the projection emits, the engine collection-schema + * type, the `where`/facet/sort semantics, and the GraphQL output/input type. + * The Typesense-vocabulary types (`string`, `int32`, …) are *derived* from this + * by the engine adapter, never declared here. + */ +export type FieldKind = + | 'text' + | 'keyword' + | 'integer' + | 'number' + | 'boolean' + | 'date' + | 'reference'; + +/** + * One queryable field — the single declarative source that drives all four + * consumers (projection, engine collection schema, query semantics, and the + * GraphQL surface). The vocabulary mirrors SHACL + the `search:` annotations so + * a generator can later emit it unchanged from shapes: + * `kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, `array`←`sh:maxCount`, + * `localized`←`rdf:langString`/`sh:languageIn`, `ref`←`sh:node`/`sh:class`. + * + * Capability flags (`searchable`/`filterable`/`facetable`/`sortable`/`output`) + * are independent opt-ins: a field exposes exactly the roles it declares. A + * field with no `path` is a **derived field** — populated by a + * {@link Derivation} rather than projected from the IR — yet it still carries + * full query/schema/output behavior (e.g. `status`, the compatibility booleans). 
+ * + * The physical field names a declaration fans out to (per-locale search/sort + * keys) follow one convention, owned by + * {@link physicalFields} so projection, collection-schema and query compiler + * cannot disagree. + */ +export interface SearchField { + /** Logical API name; the physical fanout derives from it. Declare camelCase + * where it surfaces in GraphQL. */ + readonly name: string; + readonly kind: FieldKind; + /** Framed-IR predicate IRI to project from (the SHACL `sh:path`). Omit for a + * derivation-populated field. */ + readonly path?: string; + /** Multi-valued (`sh:maxCount > 1`). */ + readonly array?: boolean; + /** Always present (`sh:minCount ≥ 1`): a non-null scalar in the API output and + * a non-optional field in the engine index. Moot for arrays/booleans/`id`, + * which are non-null regardless. */ + readonly required?: boolean; + /** Language-tagged text (`rdf:langString`); projected per locale. `text` only. */ + readonly localized?: boolean; + /** When `localized`, the languages to emit (the per-locale fanout). */ + readonly locales?: readonly string[]; + /** Appears in the API output type / carries a display label. */ + readonly output?: boolean; + /** Full-text inclusion with a `query_by` weight (folded; per-locale when + * `localized`). Presence is what makes a field searchable. */ + readonly searchable?: { readonly weight: number }; + /** Usable in `where`. */ + readonly filterable?: boolean; + /** Returned as facet buckets. */ + readonly facetable?: boolean; + /** Publicly selectable in `orderBy`; localized text also emits a folded sort key. */ + readonly sortable?: boolean; + /** For `kind: 'reference'`: the referenced shape and how much of it to carry. */ + readonly ref?: { + readonly type: string; + readonly strategy: 'labelOnly' | 'idOnly' | 'inline'; + }; + /** Projection-time value transform (e.g. strip a media-type prefix). 
*/ + readonly transform?: (value: string) => string; + /** + * Range-facet bins for a numeric (`integer`/`number`/`date`) facetable field. + * When set, the field facets into these fixed half-open `[min, max)` ranges (a + * histogram) rather than one bucket per distinct value — the per-bucket counts + * a UI slider needs. Bins are query-time only (no index impact) and + * engine-neutral: the Typesense adapter emits a `facet_by` range, an + * Elasticsearch adapter a `range` aggregation. See {@link FacetRange}. + */ + readonly facetRanges?: readonly FacetRange[]; +} + +/** + * One half-open `[min, max)` range-facet bin: `min` inclusive, `max` exclusive, + * so contiguous bins partition cleanly with no boundary double-counting. Omit + * `min` (or `max`) for an open-ended bin (`< max`, resp. `≥ min`). `key` is the + * bucket’s stable label, echoed back as the {@link FacetBucket} `value`. + */ +export interface FacetRange { + readonly key: string; + readonly min?: number; + readonly max?: number; +} + +/** + * A computed field that is not a direct projection of a single path — a status + * rank, a compatibility boolean. Reads + * the framed node and writes onto the flat document the field specs already + * populated. + */ +export type Derivation = (document: SearchDocument, node: FramedNode) => void; + +/** + * One root type’s complete search declaration — the runtime form of a single + * SHACL NodeShape: `type` is its `sh:targetClass`, `fields` are its property + * shapes (and derived fields), `derivations` are its `sh:rule`-shaped computed + * fields. A generator emits one of these per NodeShape. + */ +export interface SearchType { + readonly type: string; + readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; +} + +/** + * The complete search declaration of a deployment: every root {@link SearchType}, + * keyed by its `type` IRI — the runtime form of a whole SHACL shapes graph. + * Build one with {@link searchSchema}. 
+ */ +export type SearchSchema = ReadonlyMap; + +/** Build a {@link SearchSchema} from root-type declarations, keyed by `type`. */ +export function searchSchema(...types: readonly SearchType[]): SearchSchema { + return new Map(types.map((searchType) => [searchType.type, searchType])); +} + +/** + * The physical engine fields one {@link SearchField} fans out into, grouped by + * the role each plays. The single source of truth for the naming convention, so + * the projection (writes them), the collection schema (declares them) and the + * query compiler (reads them) cannot disagree. + */ +export interface PhysicalFields { + /** The lone stored field for a non-localized kind — faceted, filtered, sorted + * and output directly. Absent for localized text (its value lives per locale). */ + readonly value?: string; + /** Per-locale output labels `${name}_${locale}` (localized text, `output`). */ + readonly display: readonly string[]; + /** Folded match fields: `${name}_search_${locale}` per locale (localized) or a + * single `${name}_search` (non-localized), when `searchable`. */ + readonly search: readonly string[]; + /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, + * `sortable`); a non-localized field sorts on its `value`. */ + readonly sort: readonly string[]; +} + +/** + * Full-text searchable fields, highest `query_by` weight first — the order the + * engine adapter weights `query_by` in. A field is searchable iff it carries a + * `searchable` weight. + */ +export function searchableFields( + searchType: SearchType, +): readonly (SearchField & { + readonly searchable: { readonly weight: number }; +})[] { + return searchType.fields + .filter( + (field): field is SearchField & { searchable: { weight: number } } => + field.searchable !== undefined, + ) + .sort((left, right) => right.searchable.weight - left.searchable.weight); +} + +/** Fields returned as facet buckets, in declaration order. 
*/ +export function facetableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.facetable === true); +} + +/** Fields usable in `where`, in declaration order. */ +export function filterableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.filterable === true); +} + +/** Fields publicly selectable in `orderBy`, in declaration order. */ +export function sortableFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.sortable === true); +} + +/** Fields that appear in the API output type, in declaration order. */ +export function outputFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.output === true); +} + +/** Derive the physical engine field names a declaration produces. */ +export function physicalFields(field: SearchField): PhysicalFields { + const localized = field.kind === 'text' && field.localized === true; + const locales = localized ? (field.locales ?? []) : []; + return { + // Localized text has no single value field — its values live in the + // per-locale fields; every other kind stores into one `${name}` field. + value: localized ? undefined : field.name, + display: + localized && field.output + ? locales.map((locale) => `${field.name}_${locale}`) + : [], + search: field.searchable + ? localized + ? locales.map((locale) => `${field.name}_search_${locale}`) + : [`${field.name}_search`] + : [], + sort: + localized && field.sortable + ? 
locales.map((locale) => `${field.name}_sort_${locale}`) + : [], + }; +} diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts new file mode 100644 index 00000000..14966451 --- /dev/null +++ b/packages/search/test/engine.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, it } from 'vitest'; +import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; +import type { SearchQuery } from '../src/query.js'; +import type { SearchType } from '../src/schema.js'; + +const schema: SearchType = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], +}; + +// A fake engine: the port is implementable and the result types compose into a +// logical document (language map + reference) the way a real engine returns. +const fake: SearchEngine = { + async search(query: SearchQuery): Promise { + return { + total: 1, + hits: [ + { + id: 'https://example/dataset/1', + document: { + title: { nl: ['Erfgoed'], und: [query.text ?? 
''] }, + publisher: { + id: 'https://example/org/1', + label: { nl: ['Archief'] }, + }, + keyword: ['kaarten', 'atlas'], + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, + }; + }, +}; + +describe('SearchEngine port', () => { + it('returns logical hits, total and facets through the port', async () => { + const query: SearchQuery = { + text: 'kaart', + where: [], + orderBy: [{ field: 'relevance', direction: 'desc' }], + limit: 20, + offset: 0, + facets: ['keyword'], + locale: 'nl', + }; + + const result = await fake.search(query, schema); + + expect(result.total).toBe(1); + expect(result.hits[0].id).toBe('https://example/dataset/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Erfgoed'], + und: ['kaart'], + }); + expect(result.facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); +}); + +describe('typed facet and document keys', () => { + it('keys facets and the result document by the schema’s field names', async () => { + // Captured as a literal (`as const satisfies`) so the `facetable`/`output` + // flags survive and the `…Of` helpers can read the field names off the type. + const datasetSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + { name: 'format', kind: 'keyword', array: true, facetable: true }, + { name: 'status', kind: 'keyword', facetable: true }, + ], + } as const satisfies SearchType; + + // facets ⊂ { format, status }, document keys ⊂ { title }. These object + // literals would not compile if the helpers widened to `string`/`never`. 
+ const engine: EngineFor = { + async search() { + return { + total: 1, + hits: [ + { + id: 'https://example/d/1', + document: { title: { nl: ['Titel'] } }, + }, + ], + facets: { format: [{ value: 'text/turtle', count: 2 }] }, + }; + }, + }; + + const result = await engine.search( + { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: ['format'], + locale: 'nl', + }, + datasetSchema, + ); + + expect(result.facets.format).toEqual([{ value: 'text/turtle', count: 2 }]); + expect(result.hits[0].document.title).toEqual({ nl: ['Titel'] }); + }); +}); diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 60c42f71..2cd261f2 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -5,11 +5,14 @@ import { projectDocument, projectGraph, irisOf, - type FieldSpec, - type Derivation, - type Projection, type SearchDocument, } from '../src/project.js'; +import { + searchSchema, + type SearchField, + type SearchType, + type Derivation, +} from '../src/schema.js'; const DR = 'urn:dr:'; const IANA = 'https://www.iana.org/assignments/media-types/'; @@ -30,49 +33,50 @@ const node = { [`${DR}size`]: { '@type': xsd.integer.value, '@value': '1234' }, }; -const fields: FieldSpec[] = [ +const fields: SearchField[] = [ { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, - sort: true, + output: true, + searchable: { weight: 1 }, + sortable: true, }, { name: 'publisher', path: `${DR}publisherName`, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, + output: true, + searchable: { weight: 1 }, }, { name: 'publisher', path: dcterms.publisher.value, - type: 'facet', - iri: true, + kind: 'reference', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 
'format', path: `${DR}format`, - type: 'facet', + kind: 'keyword', transform: (value) => value.replace(IANA, ''), }, - { name: 'class', path: `${DR}class`, type: 'facet', iri: true }, + { name: 'class', path: `${DR}class`, kind: 'reference' }, { name: 'date_posted', path: `${DR}datePosted`, - type: 'date', + kind: 'date', }, - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, ]; const derivations: Derivation[] = [ @@ -81,11 +85,11 @@ const derivations: Derivation[] = [ }, ]; -const projection: Projection = { type: DATASET, fields, derivations }; +const schema: SearchType = { type: DATASET, fields, derivations }; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { - const document = projectDocument(node, projection); + const document = projectDocument(node, schema); expect(document.id).toBe('https://ex/d/1'); expect(document.title_nl).toBe('Titel'); @@ -121,23 +125,22 @@ describe('projectDocument', () => { { type: DATASET, fields: [ - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, { name: 'language', path: dcterms.language.value, - type: 'facet', + kind: 'keyword', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'class', path: `${DR}class`, - type: 'facet', - iri: true, + kind: 'reference', }, ], }, @@ -148,6 +151,17 @@ describe('projectDocument', () => { expect(document.class).toEqual(['http://example.org/BareClass']); }); + it('projects a number field as a float (not truncated like integer)', () => { + const document = projectDocument( + { '@id': 'https://ex/d/12', [`${DR}size`]: { '@value': '1234.5' } }, + { + type: DATASET, + fields: [{ name: 'size', path: `${DR}size`, kind: 'number' }], + }, + ); + expect(document.size).toBe(1234.5); + }); + it('folds the transformed values (not the raw ones) for a facet 
search field', () => { const document = projectDocument( { '@id': 'https://ex/d/4', [`${DR}format`]: [`${IANA}text/turtle`] }, @@ -157,8 +171,8 @@ describe('projectDocument', () => { { name: 'format', path: `${DR}format`, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, transform: (value) => value.replace(IANA, ''), }, ], @@ -232,10 +246,11 @@ describe('projectDocument', () => { { name: 'title', path: dcterms.title.value, - // search only — display and sort not opted into. - type: 'langText', + // search only — display (output) and sort not opted into. + kind: 'text', + localized: true, locales: ['nl', 'en'], - search: true, + searchable: { weight: 1 }, }, ], }, @@ -262,6 +277,38 @@ describe('projectDocument', () => { expect(document.title_search_nl).toBe('titel ondertitel'); }); + it('skips a field with no path, leaving it to a derivation (derived field)', () => { + const document = projectDocument( + { + '@id': 'https://ex/d/11', + [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, + }, + { + type: DATASET, + fields: [ + { + name: 'title', + path: dcterms.title.value, + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + // No `path`: a derived field — its value comes from a derivation, + // never from projection. 
+ { name: 'status', kind: 'keyword', facetable: true }, + ], + derivations: [ + (derived) => { + derived.status = 'valid'; + }, + ], + }, + ); + expect(document.title_nl).toBe('Titel'); + expect(document.status).toBe('valid'); + }); + it('throws when the framed node has no @id', () => { expect(() => projectDocument( @@ -284,7 +331,8 @@ describe('projectDocument', () => { { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: [], }, ], @@ -295,7 +343,7 @@ describe('projectDocument', () => { }); describe('projectGraph', () => { - it('frames each projection’s type and projects matching nodes', async () => { + it('frames each root type in the schema and projects matching nodes', async () => { const quads = new Parser({ format: 'N-Triples' }).parse(` <${rdf.type.value}> <${DATASET}> . <${dcterms.title.value}> "Titel"@nl . @@ -306,9 +354,10 @@ describe('projectGraph', () => { `); const documents: SearchDocument[] = []; - for await (const document of projectGraph(quads, [ - { type: DATASET, fields }, - ])) { + for await (const document of projectGraph( + quads, + searchSchema({ type: DATASET, fields }), + )) { documents.push(document); } diff --git a/packages/search/test/query.test.ts b/packages/search/test/query.test.ts new file mode 100644 index 00000000..b82042f5 --- /dev/null +++ b/packages/search/test/query.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; +import { acceptsFilter, filterOperatorFor } from '../src/query.js'; +import type { SearchField } from '../src/schema.js'; + +const keyword: SearchField = { + name: 'format', + kind: 'keyword', + array: true, + filterable: true, +}; +const datePosted: SearchField = { + name: 'datePosted', + kind: 'date', + filterable: true, +}; +const status: SearchField = { + name: 'status', + kind: 'keyword', + facetable: true, +}; +const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + filterable: true, +}; 
+ +describe('filterOperatorFor', () => { + it('maps each field kind to its `where` operator', () => { + expect(filterOperatorFor('text')).toBeUndefined(); + expect(filterOperatorFor('keyword')).toBe('in'); + expect(filterOperatorFor('reference')).toBe('in'); + expect(filterOperatorFor('integer')).toBe('range'); + expect(filterOperatorFor('number')).toBe('range'); + expect(filterOperatorFor('date')).toBe('range'); + expect(filterOperatorFor('boolean')).toBe('is'); + }); +}); + +describe('acceptsFilter', () => { + it('accepts a filter whose shape matches the field’s operator', () => { + expect( + acceptsFilter(keyword, { field: 'format', in: ['text/turtle'] }), + ).toBe(true); + expect( + acceptsFilter(datePosted, { + field: 'datePosted', + range: { min: '2024' }, + }), + ).toBe(true); + }); + + it('rejects a filter whose shape does not match the field’s operator', () => { + expect(acceptsFilter(keyword, { field: 'format', range: { min: 1 } })).toBe( + false, + ); + }); + + it('rejects a filter on a non-filterable field', () => { + expect(acceptsFilter(status, { field: 'status', in: ['valid'] })).toBe( + false, + ); + }); + + it('rejects any filter on a text field (it feeds the free-text query)', () => { + expect(acceptsFilter(title, { field: 'title', in: ['x'] })).toBe(false); + }); + + it('accepts an `is` filter on a filterable boolean field', () => { + const iiif: SearchField = { + name: 'iiif', + kind: 'boolean', + filterable: true, + }; + expect(acceptsFilter(iiif, { field: 'iiif', is: true })).toBe(true); + }); +}); diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts new file mode 100644 index 00000000..368821a6 --- /dev/null +++ b/packages/search/test/schema.test.ts @@ -0,0 +1,188 @@ +import { describe, expect, it } from 'vitest'; +import { + facetableFields, + filterableFields, + outputFields, + physicalFields, + searchableFields, + sortableFields, + type SearchField, + type SearchType, +} from '../src/schema.js'; + +const 
DATASET = 'http://www.w3.org/ns/dcat#Dataset'; + +const schema: SearchType = { + type: DATASET, + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + { + name: 'datePosted', + kind: 'date', + output: true, + filterable: true, + sortable: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('physicalFields', () => { + it('fans a localized text field out into per-locale display, search and sort keys', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + display: ['title_nl', 'title_en'], + search: ['title_search_nl', 'title_search_en'], + sort: ['title_sort_nl', 'title_sort_en'], + }); + }); + + it('gives a searchable keyword facet one value field and one folded search field', () => { + const keyword: SearchField = { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }; + + expect(physicalFields(keyword)).toEqual({ + value: 'keyword', + display: [], + search: ['keyword_search'], + sort: [], + }); + }); + + it('emits only the search keys for a search-only localized field (no display, no sort)', () => { + const creator: SearchField = { + name: 'creator', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + searchable: { weight: 2 }, + }; + + 
expect(physicalFields(creator)).toEqual({ + display: [], + search: ['creator_search_nl', 'creator_search_en'], + sort: [], + }); + }); + + it('emits no per-locale fields when a localized field declares no locales', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + display: [], + search: [], + sort: [], + }); + }); + + it('stores a reference field in one value field', () => { + const publisher: SearchField = { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }; + + expect(physicalFields(publisher)).toEqual({ + value: 'publisher', + display: [], + search: [], + sort: [], + }); + }); +}); + +describe('schema selectors', () => { + it('orders searchable fields by descending weight', () => { + expect(searchableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'keyword', + ]); + }); + + it('selects facetable, filterable, sortable and output fields by capability', () => { + expect(facetableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'status', + ]); + expect(filterableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'datePosted', + 'status', + ]); + expect(sortableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'datePosted', + ]); + expect(outputFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'datePosted', + 'status', + ]); + }); +}); diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index 6a8321a2..915a945a 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.3, - branches: 88.76, - statements: 
97.3, + lines: 97.84, + branches: 90.9, + statements: 97.91, }, }, }, diff --git a/tsconfig.json b/tsconfig.json index 0b6d2b2c..0defc069 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -76,6 +76,9 @@ }, { "path": "./packages/search" + }, + { + "path": "./packages/search-api-graphql" } ] }