From 2bfa22321f8ccfbf66eabcd123fddf7caa27d005 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Sun, 28 Jun 2026 20:22:34 +0200 Subject: [PATCH 01/13] feat(search)!: unify the field model and add query IR, engine port and result types - replace FieldSpec and Projection with one SearchField/SearchSchema model - add SearchQuery, Filter, Sort and the filter-operator semantics - add the SearchEngine port and result types (SearchResult/SearchHit/ResultDocument/Reference) - add physicalFields (the shared fanout convention) and schema selectors - rewrite projectDocument and projectGraph onto the unified model; projection output unchanged - remove FieldSpec, Projection and the discriminated FieldKind (breaking) --- .../0003-search-api-core-query-model.md | 112 +++++++-- packages/search/README.md | 235 ++++++++---------- packages/search/package.json | 2 +- packages/search/src/engine.ts | 140 +++++++++++ packages/search/src/index.ts | 50 +++- packages/search/src/project.ts | 204 ++++++--------- packages/search/src/query.ts | 95 +++++++ packages/search/src/schema.ts | 184 ++++++++++++++ packages/search/test/engine.test.ts | 110 ++++++++ packages/search/test/project.test.ts | 100 +++++--- packages/search/test/query.test.ts | 78 ++++++ packages/search/test/schema.test.ts | 209 ++++++++++++++++ 12 files changed, 1195 insertions(+), 324 deletions(-) create mode 100644 packages/search/src/engine.ts create mode 100644 packages/search/src/query.ts create mode 100644 packages/search/src/schema.ts create mode 100644 packages/search/test/engine.test.ts create mode 100644 packages/search/test/query.test.ts create mode 100644 packages/search/test/schema.test.ts diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 8189cda5..57521fad 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -32,7 +32,7 @@ Two tiers: `search-*` is backend you compose; 
`search-api-*` is the surface you | Tier | Package | Responsibility | | ----------- | ------------------------- | ----------------------------------------------------------------------------------------------------------------------- | -| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · adapter port | +| backend | `@lde/search` | field model · `SearchQuery` · filter semantics · engine port | | backend | `@lde/search-typesense` | engine adapter: collection schema · query/filter compiler · `search()` | | API surface | `@lde/search-api-graphql` | field model + `SearchQuery` → GraphQL schema (runtime configuration; see [ADR 4](./0004-search-api-graphql-surface.md)) | | API surface | `@lde/search-api-rest` | OpenAPI + route handlers (later, thin over the core) | @@ -46,12 +46,22 @@ The **API contract** (the SDL shape consumers couple to) is breaking to change a right in v1. The **IR / stored document** (framed JSON-LD vs a flat engine doc) lives behind the adapter and is swappable with no consumer impact. Nothing engine-specific (companion fields, `int32`, the engine query language) and nothing RDF-specific -(`@context`, `@id`, IRI-keyed predicates) leaks past the adapter port. +(`@context`, `@id`, IRI-keyed predicates) leaks past the engine port. ### Field model The engine-neutral description of a queryable field – the runtime form of one SHACL -NodeShape + its `search:` annotations: +NodeShape + its `search:` annotations. **One `SearchField` declaration drives four +consumers** – projection (RDF→flat document), the engine collection schema, the query +semantics, and the GraphQL surface – so they cannot drift. + +> Updated 2026-06-26 (during implementation): this is the **unified** field model. It +> folds the three previously separate declarations into one – the projection-side +> `FieldSpec`/`FieldKind` (RDF→doc), the deployment’s Typesense `SEARCH_FIELDS` (collection +> schema + weights), and the query model below. 
The original ADR deferred this unification; +> it is now adopted (option “c”). The `kind` + capability flags replace the old discriminated +> projection kinds, derived fields become first-class, and the Typesense-vocabulary types are +> _derived_ from `kind` rather than re-declared. ```ts type FieldKind = @@ -64,31 +74,44 @@ type FieldKind = | 'reference'; interface SearchField { - readonly name: string; // logical API name + readonly name: string; // logical API name; the physical fanout derives from it readonly kind: FieldKind; - readonly array?: boolean; - readonly localized?: boolean; + readonly path?: string; // sh:path to project from; omit for a derivation-populated field + readonly array?: boolean; // sh:maxCount + readonly localized?: boolean; // rdf:langString / sh:languageIn (text only) + readonly locales?: readonly string[]; // when localized: which languages to emit readonly output?: boolean; // appears in the schema output type - readonly searchable?: { weight: number }; // free-text inclusion + weight + readonly searchable?: { weight: number }; // free-text inclusion + weight (per-locale when localized) readonly filterable?: boolean; // usable in `where` readonly facetable?: boolean; readonly sortable?: boolean; - readonly nestedStrategy?: 'labelOnly' | 'idOnly' | 'inline'; // for `reference` + readonly ref?: { type: string; strategy: 'labelOnly' | 'idOnly' | 'inline' }; // kind: 'reference' + readonly transform?: (value: string) => string; // projection-time value transform readonly group?: { readonly name: string; readonly prefix: string }; // deployment delta } +type Derivation = (document: SearchDocument, node: FramedNode) => void; + interface SearchSchema { + readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; // computed fields: status, *_group, booleans } ``` -Maps onto SHACL + `search:` (`kind`←`sh:datatype`, `array`←`sh:maxCount`, -`localized`←`sh:languageIn`, 
`facetable`←`search:facetable`, `sortable`←`search:sortable`, -`nestedStrategy`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an eventual generator -emits it unchanged. The `group` companion (coarse grouped facets, e.g. `format_group`) and -the `status_rank` tie-break sort are **deployment-specific deltas**, never in `@lde/search`. -`relevance` is _not_ a delta: every full-text engine ranks by match score, so it is a -generic reserved sort the adapter understands. +Maps onto SHACL + `search:` (`kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, +`array`←`sh:maxCount`, `localized`←`sh:languageIn`, `facetable`←`search:facetable`, +`sortable`←`search:sortable`, `ref`←`sh:node`/`sh:class` + `search:nestedStrategy`) so an +eventual generator emits it unchanged. A field with **no `path`** is a derived field – +populated by a `Derivation` rather than projected from the IR – yet it still carries full +query/schema/output behavior, which is how the former separate projection `FieldSpec` is +subsumed. The physical field names a declaration fans out to (`${name}_search_${locale}`, +`${name}_sort_${locale}`, `${name}_search`, `${name}_group`) follow one convention owned by +`@lde/search`, so projection, collection schema and query compiler agree. The `group` +companion (coarse grouped facets, e.g. `format_group`) and the `status_rank` tie-break sort +are **deployment-specific deltas**, never in `@lde/search`. `relevance` is _not_ a delta: +every full-text engine ranks by match score, so it is a generic reserved sort the adapter +understands. ### `SearchQuery` – the neutral query IR @@ -150,22 +173,61 @@ matches Typesense’s native inclusive range, covers every DR case, additively r Grouped facets need no special shape – `group:`-prefixed tokens travel as ordinary `in` strings and the adapter splits/unions them. 
-### Adapter port and result +### Engine port and result + +The **port** is the interface the core defines; a concrete engine **adapter** +(`@lde/search-typesense`’s `TypesenseSearchEngine`) implements it. Naming the port for the +capability (`SearchEngine`), not the pattern piece, keeps `TypesenseSearchEngine implements +SearchEngine` readable. ```ts -interface SearchAdapter { - search(query: SearchQuery, schema: SearchSchema): Promise; +// FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them +// to its schema’s facetable / output field names for typo-safe facet and document access +// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). +interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, +> { + search( + query: SearchQuery, + schema: SearchSchema, + ): Promise>; } -interface SearchResult { - readonly hits: readonly { id: string; document: SearchDocument }[]; +interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit[]; readonly total: number; + // Keyed by facet field name; `Partial` because only the queried facets are present. + // A bucket’s `label` (a LocalizedValue) is the engine-resolved canonical data label, + // present only for reference (IRI-keyed) facets; absent for token/free-string facets, + // whose display the consumer owns (its own i18n, or the value itself). readonly facets: Readonly< - Record + Partial< + Record< + FacetField, + readonly { value: string; count: number; label?: LocalizedValue }[] + > + > >; } -type SearchDocument = Record; +// `id` (the stable document key, an IRI) stays out of the document: it is the hit’s +// identity, always present, a different contract from the optional logical field values, +// and maps straight onto the GraphQL output’s `id: String!`. 
+interface SearchHit { + readonly id: string; + readonly document: ResultDocument; +} + +// The logical result document. Named distinctly from the flat, fanned-out projection +// `SearchDocument` that lives index-side: this carries logical fields (language maps, +// references) ready for a surface to shape. +type ResultDocument = Readonly< + Partial> +>; type SearchValue = | string | number @@ -247,7 +309,9 @@ not enabled for DR v1, more relevant for B/C. - Deviations to reconcile into the platform draft: numbered pagination; sidecar labels; logical result doc (framed JSON-LD scoped to index-side); `min`/`max` filter ranges; the `@lde/search*` naming and a core package row. +- Adopted during implementation (2026-06-26): the **unified** field model – the projection + `FieldSpec` (RDF→doc) and the deployment’s Typesense `SEARCH_FIELDS` are folded into this + one `SearchField` (see the Field model note above). - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) - search; unifying the projection `FieldSpec` (RDF→doc) with this `SearchField` - (query/output) into one field declaration. + search. diff --git a/packages/search/README.md b/packages/search/README.md index 5672881e..476170d9 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -1,170 +1,155 @@ # @lde/search -Engine-agnostic search projection for RDF-backed pipelines. **`projectGraph`** -streams the result of a SPARQL `CONSTRUCT` into flat search documents, with no -engine and no vocabulary baked in. Internally it does two things per subject of -a root type: frame its one-hop subgraph into a JSON-LD IR node, then project -that node into a flat document from a **declarative field spec**. +The **engine- and domain-agnostic core** for RDF-backed search. 
It bakes in no +search engine, no API protocol, and no domain vocabulary: you supply a +declarative `SearchSchema`, and engine adapters and API surfaces sit on the ports +defined here. The library never names your domain — the same core drives a +`Dataset`, `Person`, or `CreativeWork` search. + +It provides four things: + +- **the unified field model** — `SearchField` / `SearchSchema`: one declaration + per field that drives all four consumers below, so they cannot drift; +- **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter + semantics, the shared compiler target every API surface parses into; +- **the engine port** — `SearchEngine` and the logical result types + (`SearchResult` / `SearchHit` / `ResultDocument` / `Reference` / …); +- **a streaming projection** — `projectGraph`, RDF `CONSTRUCT` quads → flat + search documents. -An engine adapter (e.g. [`@lde/search-typesense`](../search-typesense)) then -writes those documents to a search backend. - -```ts -import { projectGraph, type Projection } from '@lde/search'; - -const projection: Projection = { - /* type + field spec — see below */ -}; - -for await (const document of projectGraph(quads, [projection])) { - // one flat search document per matching subject, streamed -} +``` +SearchSchema ─┬─► projection (projectGraph → flat documents) [here] + ├─► engine adapter (collection schema + query compiler) e.g. @lde/search-typesense + ├─► query semantics (SearchQuery, filter/sort/facet) [here] + └─► API surface (GraphQL / REST) e.g. @lde/search-api-graphql ``` -`projectGraph` is fully streaming: subjects are grouped and framed one at a time -and documents are yielded as they are produced, so beyond a subject index memory -stays flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate -triples are collapsed first, because some SPARQL engines (e.g. QLever) do not -deduplicate `CONSTRUCT` output. 
The IR carries no `@context`, so a `derivation` -reading it sees full predicate IRIs with language tags preserved. +One field, four consumers — that is why the model is unified: a field’s `kind` +plus capability flags (`searchable` / `filterable` / `facetable` / `sortable` / +`output`) describe projection, the engine collection schema, the query semantics, +and the API output in a single place. -## Projection +## Field model -The mapping is data, not code. Each field declares the IR `path` to read and a -`kind`; the conventions (per-locale split, diacritic folding via -[`@lde/text-normalization`](../text-normalization), facet arrays, numeric -coercion) are applied for you. Computed fields are `derivations` — hooks that -read the node and set fields the kinds can't. +The mapping is data, not code. Each field declares its `kind`, the IR `path` to +read (omit it for a **derived** field, populated by a `derivation`), and the +capabilities it opts into. The physical field names a declaration fans out to +(per-locale search/sort keys, the grouped-facet companion) come from +`physicalFields`, the single convention projection, the collection schema and the +query compiler all share. 
```ts -import { projectGraph, irisOf, type Projection } from '@lde/search'; +import { projectGraph, irisOf, type SearchSchema } from '@lde/search'; -const projection: Projection = { +const DATASET = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ - // → title_nl, title_en, title_search_nl, title_search_en, title_sort_nl, title_sort_en + // → title_nl, title_en, title_search_nl/_en, title_sort_nl/_en { name: 'title', path: 'http://purl.org/dc/terms/title', - kind: { - type: 'langText', - locales: ['nl', 'en'], - display: true, - search: true, - sort: true, - }, + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, }, - // → publisher (IRI facet) + // → publisher (IRI facet, resolved to a labelled reference at the surface) { name: 'publisher', path: 'http://purl.org/dc/terms/publisher', - kind: { type: 'facet', iri: true }, + kind: 'reference', + facetable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, }, // → size (int) - { name: 'size', path: 'urn:dr:size', kind: { type: 'number' } }, + { name: 'size', path: 'urn:dr:size', kind: 'integer', sortable: true }, + // derived field (no path): populated by the derivation below + { name: 'classCount', kind: 'integer', sortable: true }, ], derivations: [ (document, node) => { - document.class_count = irisOf(node, 'urn:dr:class').length; + document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -}; +} as const satisfies SearchSchema; -for await (const document of projectGraph(quads, [projection])) { - // … +for await (const document of projectGraph(quads, [DATASET])) { + // one flat search document per matching subject, streamed } ``` -**Kinds** +Capturing the schema with `as const satisfies SearchSchema` keeps the field +literals, so the API surface can derive typed facet/output keys from it (see +`@lde/search-api-graphql`). 
+ +**Kinds** (`FieldKind`): `text`, `keyword`, `integer`, `number`, `boolean`, +`date`, `reference`. The Typesense/engine vocabulary and the GraphQL types are +_derived_ from the kind by the adapter and the surface — never declared here. -| kind | emits | -| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `langText` | per locale (see below), each opt-in: `_${locale}` display with `display`, `_search_${locale}` folded with `search`, `_sort_${locale}` folded with `sort` | -| `facet` | the field as a deduped array; `iri` reads `@id`; `search` adds a folded `_search`; `transform` rewrites values | -| `number` | a numeric scalar; `date` parses an ISO date-time to unix seconds | +| kind | `where` | facet | sort | output | +| -------------------- | -------------------- | ----- | ---------------- | ------------------------------- | +| `text` (`localized`) | – (feeds free text) | – | yes (per-locale) | best-first language list | +| `keyword` | `in` (membership) | yes | – | string / `string[]` | +| `reference` | `in` (membership) | yes | – | labelled reference (id + label) | +| `integer` / `number` | `range { min, max }` | yes | yes | number | +| `date` | `range` (inclusive) | yes | yes | ISO 8601 string (surface) | +| `boolean` | `is` | yes | – | boolean (absent = false) | + +## Projection + +`projectGraph` is fully streaming: subjects are grouped and framed one at a time +and documents are yielded as produced, so beyond a subject index memory stays +flat at scale (framing the whole graph at once is roughly O(N²)). Duplicate +triples are collapsed first, because some SPARQL engines (e.g. QLever) do not +deduplicate `CONSTRUCT` output. The IR carries no `@context`, so a `derivation` +reading it sees full predicate IRIs with language tags preserved. 
## Locales -`locales` is the **single** list of languages a `langText` field projects; -`display`, `search` and `sort` are independent opt-in families that each fan out +`locales` is the **single** list of languages a localized `text` field projects; +`output`, `searchable` and `sortable` are independent opt-ins that each fan out over it (so a field emits exactly what it opts into): -- `display` → `title_nl`/`title_en` (accents preserved); -- `search` → `title_search_nl`/`title_search_en` (folded; one field per locale - lets a query `query_by` them and rank the user’s language higher via - `query_by_weights`, and lets a language that needs a dedicated tokenizer set - its own `locale` in the schema); -- `sort` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI +- `output` → `title_nl`/`title_en` (accents preserved); +- `searchable` → `title_search_nl`/`title_search_en` (folded; one field per locale + lets a query `query_by` them and rank the user’s language higher, and lets a + language that needs a dedicated tokenizer set its own stemming `locale` in the + engine schema); +- `sortable` → `title_sort_nl`/`title_sort_en` (folded, so a locale-switching UI sorts on the active language). -A field with `search` but no `display` is **search-only** — folded and stemmed -for retrieval but never rendered (e.g. a `publisher` searched here but shown via -a separate label). +A field with `searchable` but no `output` is **search-only** — folded and stemmed +for retrieval but never rendered (e.g. a creator searched here but shown via a +separate label). **Only listed locales are indexed**; a literal whose language tag +is not in `locales` (or has no tag) is not projected at all. Per-locale fields are +**omitted, never empty**, when a document lacks that language, so declare them +optional in the engine schema and sort with `missing_values: last`. Folding the search fields is what lets diacritic-insensitive matching and stemming coexist. 
A search engine on its **default** locale typically folds case -and diacritics for you (Typesense v30, verified, even folds ø/æ/ß) — so there the -folding here is belt-and-suspenders. But enabling a language’s **stemming** -requires setting that language’s `locale` (e.g. `locale: 'nl'` + `stem: true` so -`huizen` matches `huis`), and a non-default locale switches the engine to ICU -tokenization, which **preserves** diacritics. At that point the engine no longer -folds them, and `fold()` is what keeps matching diacritic-insensitive. Stemming -is a per-field engine-schema choice (the consumer’s), and being rules-based it -can mangle proper nouns and place names — e.g. the Dutch stemmer reduces the city -`Bergen` to `berg`, colliding it with “mountain”. - -Recommended split: enable stemming on the **free-text** search fields -(`*_search_${locale}`, descriptions, keywords) where morphological recall helps -(`verhaal` ↔ `verhalen`), and keep **place names and other proper-noun facets on -a separate, unstemmed field** (facets are exact-match anyway). That captures the -recall without the `Bergen`/`berg` collision in the facet. A `stem_dictionary` -can pin specific names if you need stemmed free-text without given collisions. - -**Only listed locales are indexed.** A literal whose language tag is not in -`locales` is not projected at all — no display, no search, no sort field — so it -is invisible to the index. To index a language, add it to `locales`. - -Per-locale fields are **omitted, never empty**, when a document lacks that -language, so declare them `optional: true` in the engine schema. At query time, -sort with `missing_values: last` to push documents lacking the active locale to -the end, and `query_by` all the per-locale search fields (weighting the user’s -locale higher) to keep cross-language recall. - -A literal with no `@language` tag matches no locale, so it is not projected. Tag -your source literals (or pre-process them) for the languages you index. 
+and diacritics for you; enabling a language’s **stemming** switches it to ICU +tokenization, which **preserves** diacritics — at which point `fold()` (from +[`@lde/text-normalization`](../text-normalization)) is what keeps matching +diacritic-insensitive. Stemming is rules-based and can mangle proper nouns (the +Dutch stemmer reduces the city `Bergen` to `berg`), so enable it on free-text +fields and keep proper-noun facets on a separate, unstemmed field. ## Querying The search fields are stored already case- and diacritic-folded, so **the query -must be folded the same way** with the same `fold()` from -[`@lde/text-normalization`](../text-normalization) before it reaches the engine. -Otherwise index and query are normalized differently and matches silently miss -(the user sees no results, with no error). An engine on its default locale would -fold a raw query for you, but one set to a stemming locale (which preserves -diacritics) or a non-folding backend will not — so always fold, and matching -stays correct on any engine. - -```ts -import { fold } from '@lde/text-normalization'; - -await client - .collections(collection) - .documents() - .search({ - q: fold(userQuery), - query_by: 'title_search_nl,title_search_en', - query_by_weights: '2,1', // rank the user’s locale higher - }); -``` - -This contract holds for **any** consumer, including a search API built on top of -this package: index-time and query-time folding must use the same `fold()`, or -non-decomposing terms silently miss. - -## Why a spec - -The field spec's vocabulary mirrors SHACL on purpose: `path` is `sh:path`, and -the kind is derivable from `sh:datatype` / `sh:nodeKind` / `sh:maxCount` plus -search annotations. So the same projection engine that runs a hand-written spec -today will run a **SHACL-generated** spec tomorrow — the engine and the IR stay; -only spec-authoring gets automated. Nothing is thrown away. 
+must be folded the same way** with the same `fold()` before it reaches the engine, +or index and query normalize differently and matches silently miss. This contract +holds for **any** consumer, including an API built on this package — which is why +engine adapters and surfaces compile through the shared `SearchQuery` IR and the +`physicalFields` convention rather than re-deriving field names. + +## Why a declarative model + +The vocabulary mirrors SHACL on purpose: `path` is `sh:path`, `array` is +`sh:maxCount`, `required` is `sh:minCount`, `localized` is `sh:languageIn`, `ref` +is `sh:class`/`sh:node`. So the same core that runs a hand-written `SearchSchema` +today will run a **SHACL-generated** one tomorrow — the model, the ports and the +IR stay; only schema-authoring gets automated. diff --git a/packages/search/package.json b/packages/search/package.json index 61657f95..e81f647f 100644 --- a/packages/search/package.json +++ b/packages/search/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search", "version": "0.1.2", - "description": "Engine-agnostic search projection for RDF-backed pipelines: frame CONSTRUCT quads into a JSON-LD IR, then project that IR into flat search documents from a declarative field spec (the artifact a SHACL generator would emit)", + "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. 
Bakes in no engine, protocol, or domain.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search" diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts new file mode 100644 index 00000000..59284d7f --- /dev/null +++ b/packages/search/src/engine.ts @@ -0,0 +1,140 @@ +import type { SearchQuery } from './query.js'; +import type { SearchSchema } from './schema.js'; + +/** + * The engine port — the boundary a concrete engine adapter (e.g. + * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. The adapter + * owns every engine specific (companion-field expansion, `query_by`/weights, the + * filter compiler, `sort_by`, folding, `facet_by`) and returns only logical + * documents, so a deployment can swap engines without any consumer noticing. + * Nothing engine-specific and nothing RDF-specific leaks past this port. + * + * `FacetField` keys the returned facet map; it defaults to `string` so an engine + * stays ergonomic, and a deployment can narrow it to its own facet-field union + * (see {@link FacetFieldsOf}) for typo-safe facet access. + */ +export interface SearchEngine< + FacetField extends string = string, + OutputField extends string = string, +> { + search( + query: SearchQuery, + schema: SearchSchema, + ): Promise<SearchResult<FacetField, OutputField>>; +} + +/** What an engine returns: logical hits, a total, and the requested facets. */ +export interface SearchResult< + FacetField extends string = string, + OutputField extends string = string, +> { + readonly hits: readonly SearchHit<OutputField>[]; + readonly total: number; + readonly facets: FacetMap<FacetField>; +} + +/** + * Facet buckets keyed by facet field name. `Partial` because a result carries + * buckets only for the fields the query asked for, not every facetable field. + */ +export type FacetMap<FacetField extends string = string> = Readonly< + Partial<Record<FacetField, readonly FacetBucket[]>> +>; + +/** + * The facet-field-name union of a schema — the keys a {@link SearchResult}’s + * `facets` can hold.
Requires the schema be captured as a literal type + * (`as const satisfies SearchSchema`), so the `facetable: true` flags survive as + * literals; a plain `: SearchSchema` annotation widens them and yields `never`. + */ +export type FacetFieldsOf<Schema extends SearchSchema> = Extract< + Schema['fields'][number], + { readonly facetable: true } +>['name']; + +/** + * The output-field-name union of a schema — the keys a {@link ResultDocument} + * can hold. Like {@link FacetFieldsOf}, requires the schema captured as a literal + * (`as const satisfies SearchSchema`). + */ +export type OutputFieldsOf<Schema extends SearchSchema> = Extract< + Schema['fields'][number], + { readonly output: true } +>['name']; + +/** A {@link SearchEngine} narrowed to one schema: facet keys and document keys + * fixed to that schema’s facetable / output field names. The schema must be + * captured as `as const satisfies SearchSchema`. */ +export type EngineFor<Schema extends SearchSchema> = SearchEngine< + FacetFieldsOf<Schema>, + OutputFieldsOf<Schema> +>; + +/** A {@link SearchResult} narrowed to one schema (see {@link EngineFor}). */ +export type ResultFor<Schema extends SearchSchema> = SearchResult< + FacetFieldsOf<Schema>, + OutputFieldsOf<Schema> +>; + +/** + * One result row. `id` (the stable document key, an IRI) is kept *out* of + * {@link ResultDocument}: it is always present and is the hit’s identity, a + * different contract from the optional, typed logical field values — and it maps + * straight onto the GraphQL output’s guaranteed `id: String!`. The document + * holds only the selectable fields. + */ +export interface SearchHit<OutputField extends string = string> { + readonly id: string; + readonly document: ResultDocument<OutputField>; +} + +/** + * The logical result document at the query seam — engine- and RDF-neutral. + * Distinct from the flat, fanned-out projection `SearchDocument` that lives + * index-side: this carries logical fields with language maps and references, + * ready for a surface to shape. Keyed by output field name; `Partial` because a + * document omits absent optional fields.
`OutputField` defaults to `string`; a + * deployment narrows it via {@link OutputFieldsOf} for typo-safe field access. + */ +export type ResultDocument<OutputField extends string = string> = Readonly< + Partial<Record<OutputField, SearchValue>> +>; + +/** A logical field value. */ +export type SearchValue = + | string + | number + | boolean + | readonly string[] + | LocalizedValue + | Reference + | readonly Reference[]; + +/** + * A JSON-LD-style language map (`@container: @language`, `@set` arrays); the key + * `und` carries untagged (`@none`) values. The surface flattens it to a + * best-first `Accept-Language`-ordered list. + */ +export type LocalizedValue = Readonly<Record<string, readonly string[]>>; + +/** + * The generic internal carrier for a referenced entity. The GraphQL surface maps + * it to a named per-shape type (e.g. `Organization`, `Term`) with `label` + * exposed as `name`. + */ +export interface Reference { + readonly id: string; + readonly label?: LocalizedValue; +} + +/** + * One facet bucket: a value and how many documents carry it. `label` is the + * engine-resolved canonical **data** label, present only for reference facets + * (IRI-keyed); it is absent for facets whose value is a token or free string + * whose display the consumer owns (its own i18n, or the value itself). + */ +export interface FacetBucket { + readonly value: string; + readonly count: number; + readonly label?: LocalizedValue; +} diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 10c2b32f..cb02290e 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -1,13 +1,47 @@ +// Projection: RDF CONSTRUCT quads → flat search documents, driven by the unified +// SearchField/SearchSchema model below (one declaration; the fanout names come +// from `physicalFields`). export { projectGraph, irisOf, literalsOf, firstLiteralOf } from './project.js'; +export type { SearchDocument } from './project.js'; + +// Unified field model: one declaration drives projection, engine collection +// schema, query semantics and the GraphQL surface.
Plus the schema selectors and +// the physical field-name convention they all share. +export { + physicalFields, + searchableFields, + facetableFields, + filterableFields, + sortableFields, + outputFields, +} from './schema.js'; export type { - SearchDocument, - Projection, - FieldSpec, FieldKind, - LangTextKind, - FacetKind, - NumberKind, - DateKind, + SearchField, + SearchSchema, Derivation, -} from './project.js'; + PhysicalFields, +} from './schema.js'; + +// Engine- and protocol-neutral query IR + filter semantics. +export { filterOperatorFor, filterOperator, acceptsFilter } from './query.js'; +export type { SearchQuery, Filter, Sort, FilterOperator } from './query.js'; + +// Engine port + the logical result document returned across it. +export type { + SearchEngine, + SearchResult, + SearchHit, + ResultDocument, + SearchValue, + LocalizedValue, + Reference, + FacetBucket, + FacetMap, + FacetFieldsOf, + OutputFieldsOf, + EngineFor, + ResultFor, +} from './engine.js'; + export type { FramedNode } from './frame-by-type.js'; diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index c181978f..284c3183 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -1,135 +1,56 @@ import type { Quad } from '@rdfjs/types'; import { fold } from '@lde/text-normalization'; import { frameByType, type FramedNode } from './frame-by-type.js'; +import { + physicalFields, + type SearchField, + type SearchSchema, +} from './schema.js'; /** A flat search document. `id` is the engine document key. */ export type SearchDocument = { id: string } & Record; -/** - * How one framed-IR property projects into search fields. The vocabulary mirrors - * SHACL so a generator can later emit it from shapes + search annotations: - * `path` is `sh:path`, and the kind is derivable from `sh:datatype`/`sh:nodeKind` - * /`sh:maxCount` plus the search annotations. 
- */ -export type FieldKind = LangTextKind | FacetKind | NumberKind | DateKind; - -/** - * Language-tagged text, projected per locale. `locales` is the single source of - * truth for which languages this field emits; `display`, `search` and `sort` are - * three independent opt-in families that each fan out over it: - * - `display` → `${name}_${locale}` display label, accents preserved; - * - `search` → `${name}_search_${locale}` folded match field (one per locale so - * the engine can tokenize/stem each language and the query can rank the user’s - * locale higher); - * - `sort` → `${name}_sort_${locale}` folded sort key (one per locale so a - * locale-switching UI sorts on the active language). - * - * All three default off — a field emits exactly the families it opts into (e.g. - * `search` alone is a search-only field, shown via a separate label). Only listed - * locales are projected: a value whose language tag is not in `locales` (and is - * not mapped in by `untaggedLanguage`) is not indexed at all. - */ -export interface LangTextKind { - readonly type: 'langText'; - /** The languages to project; drives whichever of the families are enabled. */ - readonly locales: readonly string[]; - /** Emit the per-locale display labels `${name}_${locale}` (accents preserved). */ - readonly display?: boolean; - /** Emit a folded `${name}_search_${locale}` per locale (matchable). */ - readonly search?: boolean; - /** Emit a folded `${name}_sort_${locale}` per locale (sortable). */ - readonly sort?: boolean; -} - -/** A faceted multi-value field, optionally also folded for search. */ -export interface FacetKind { - readonly type: 'facet'; - /** Read IRI references (`@id`) rather than literal values. */ - readonly iri?: boolean; - /** Also emit a folded `${name}_search` array. */ - readonly search?: boolean; - /** Transform each value before faceting (e.g. strip a media-type prefix). */ - readonly transform?: (value: string) => string; -} - -/** A numeric scalar. 
*/ -export interface NumberKind { - readonly type: 'number'; -} - -/** An ISO date-time, parsed into Unix seconds. */ -export interface DateKind { - readonly type: 'date'; -} - -/** - * One field of a projection: an output `name`, the framed-IR predicate `path` to - * read (the SHACL `sh:path`), and the kind-specific config discriminated by - * `type`. - */ -export type FieldSpec = { - /** Output field base name; per-kind suffixes are appended. */ - readonly name: string; - /** Framed-IR predicate IRI to read (the SHACL `sh:path`). */ - readonly path: string; -} & FieldKind; - -/** A computed field that is not a direct projection of a single path - * (e.g. a status rank, or a group derived from a code table). */ -export type Derivation = (document: SearchDocument, node: FramedNode) => void; - -/** - * One root type’s complete projection — the runtime form of a single SHACL - * NodeShape: `type` is its `sh:targetClass` (and the framed node’s `@type`), - * `fields` are its property shapes, and `derivations` are its `sh:rule`-shaped - * computed fields. A generator emits one of these per NodeShape. - */ -export interface Projection { - readonly type: string; - readonly fields: readonly FieldSpec[]; - readonly derivations?: readonly Derivation[]; -} - /** * Project one framed JSON-LD node into a flat search document: apply each field - * spec, then run the derivations (which may read fields the specs already set). + * of the schema, then run the derivations (which may read fields the field specs + * already set). The physical field names a field fans out to come from + * {@link physicalFields}, the single source shared with the engine collection + * schema and the query compiler. 
*/ export function projectDocument( node: FramedNode, - projection: Projection, + schema: SearchSchema, ): SearchDocument { const id = node['@id']; if (typeof id !== 'string') { throw new Error( - `Cannot project a ${projection.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, + `Cannot project a ${schema.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, ); } const document: SearchDocument = { id }; - for (const field of projection.fields) { + for (const field of schema.fields) { applyField(document, node, field); } - for (const derive of projection.derivations ?? []) { + for (const derive of schema.derivations ?? []) { derive(document, node); } return document; } /** - * Frame `quads` for every projection’s root type and project each node with its - * type’s projection — the multi-shape pipeline. Streams one document at a time - * so memory stays flat. The IR maps to a projection by type, so adding a shape - * is adding a `Projection` (no engine change). + * Frame `quads` for every schema’s root type and project each node with its + * type’s schema — the multi-shape pipeline. Streams one document at a time so + * memory stays flat. The IR maps to a schema by type, so adding a shape is + * adding a `SearchSchema` (no engine change). 
*/ export async function* projectGraph( quads: readonly Quad[], - projections: readonly Projection[], + schemas: readonly SearchSchema[], ): AsyncIterable { - const byType = new Map( - projections.map((projection) => [projection.type, projection]), - ); - for (const projection of byType.values()) { - for await (const node of frameByType(quads, projection.type)) { - yield projectDocument(node, projection); + const byType = new Map(schemas.map((schema) => [schema.type, schema])); + for (const schema of byType.values()) { + for await (const node of frameByType(quads, schema.type)) { + yield projectDocument(node, schema); } } } @@ -137,77 +58,96 @@ export async function* projectGraph( function applyField( document: SearchDocument, node: FramedNode, - field: FieldSpec, + field: SearchField, ): void { - switch (field.type) { - case 'langText': - return applyLangText(document, langValuesOf(node, field.path), field); - case 'facet': - return applyFacet(document, node, field); - case 'number': + const path = field.path; + if (path === undefined) { + // A derived field — populated by a derivation, not projected from a path. + return; + } + switch (field.kind) { + case 'text': + return applyLocalizedText(document, langValuesOf(node, path), field); + case 'keyword': + return applyFacet(document, literalsOf(node, path), field); + case 'reference': + return applyFacet(document, irisOf(node, path), field); + case 'integer': return setNumber( document, field.name, - toInteger(firstLiteralOf(node, field.path)), + toInteger(firstLiteralOf(node, path)), ); case 'date': return setNumber( document, field.name, - isoToUnix(firstLiteralOf(node, field.path)), + isoToUnix(firstLiteralOf(node, path)), ); } + // `number` and `boolean` are not projected from a path in current schemas + // (booleans are derivation-populated, e.g. the compatibility vinkjes). } -function applyLangText( +/** + * Project a language-tagged text field per locale. 
Display shows one label + * (accents preserved) when the field is `output`; sort keys off that same + * primary value (folded) when `sortable`; search folds every value of the locale + * when `searchable`, so all are matchable. Absent locales emit nothing. + */ +function applyLocalizedText( document: SearchDocument, values: readonly LangValue[], - { name, locales, display, search, sort }: Extract, + field: SearchField, ): void { + const locales = field.locales ?? []; if (locales.length === 0) { throw new Error( - `langText field “${name}” must declare at least one locale; nothing would be projected otherwise.`, + `Localized text field “${field.name}” must declare at least one locale; nothing would be projected otherwise.`, ); } - for (const locale of locales) { + const names = physicalFields(field); + locales.forEach((locale, index) => { const localeValues = values .filter((value) => value.lang === locale) .map((value) => value.value); if (localeValues.length === 0) { - continue; + return; } - // Display shows one label (accents preserved); sort keys off that same - // primary value (folded); search folds every value of the locale so all - // are matchable. Absent locales emit nothing (the field stays optional). const [primary] = localeValues; - if (display) { - setString(document, `${name}_${locale}`, primary); + if (field.output) { + setString(document, names.display[index], primary); } - if (search) { + if (field.searchable) { setString( document, - `${name}_search_${locale}`, + names.search[index], fold(localeValues.join(' ')).trim(), ); } - if (sort) { - setString(document, `${name}_sort_${locale}`, fold(primary)); + if (field.sortable) { + setString(document, names.sort[index], fold(primary)); } - } + }); } +/** + * Project a faceted multi-value field: dedupe (after the optional transform), + * write the value field, and — when `searchable` — a folded `${name}_search` + * array. 
`keyword` reads literals; `reference` reads IRIs (the caller passes the + * already-read raw values). + */ function applyFacet( document: SearchDocument, - node: FramedNode, - { name, path, iri, search, transform }: Extract, + raw: readonly string[], + field: SearchField, ): void { - const raw = iri ? irisOf(node, path) : literalsOf(node, path); - const values = dedupe(transform ? raw.map(transform) : raw); - setArray(document, name, values); - if (search) { + const values = dedupe(field.transform ? raw.map(field.transform) : raw); + setArray(document, field.name, values); + if (field.searchable) { setArray( document, - `${name}_search`, + physicalFields(field).search[0], dedupe(values.map((value) => fold(value))), ); } diff --git a/packages/search/src/query.ts b/packages/search/src/query.ts new file mode 100644 index 00000000..d009ea75 --- /dev/null +++ b/packages/search/src/query.ts @@ -0,0 +1,95 @@ +import type { FieldKind, SearchField } from './schema.js'; + +/** + * The engine- and protocol-neutral query IR. Every API surface parses its input + * into this; the engine adapter consumes it. It is the shared compiler target + * that keeps the GraphQL surface, a later REST surface and the adapter from + * drifting. + */ +export interface SearchQuery { + /** Free-text query; `undefined`/`''` means browse (no text ranking). */ + readonly text?: string; + /** AND across fields. */ + readonly where: readonly Filter[]; + /** Primary public sort plus any server tie-breaks, in precedence order. */ + readonly orderBy: readonly Sort[]; + /** Numbered pagination. */ + readonly limit: number; + readonly offset: number; + /** Logical field names to return facet buckets for. */ + readonly facets: readonly string[]; + /** Selects the per-locale fields to query/sort on (from `Accept-Language`). */ + readonly locale: string; +} + +/** + * One `where` clause. 
The operator is fixed by the target field’s {@link FieldKind} + * ({@link filterOperatorFor}): keyword/reference use `in` (OR within the field), + * the numeric/date kinds use an inclusive `range`, boolean uses `is`. Bounds are + * inclusive only — no `gt`/`gte`/`lt`/`lte`. + */ +export type Filter = + | { readonly field: string; readonly in: readonly string[] } + | { + readonly field: string; + readonly range: { + readonly min?: number | string; + readonly max?: number | string; + }; + } + | { readonly field: string; readonly is: boolean }; + +/** A single sort dimension. */ +export interface Sort { + readonly field: string; + readonly direction: 'asc' | 'desc'; +} + +/** The `where` operator a kind accepts, or `undefined` when it is not filterable + * through `where` (`text` feeds the free-text `query` instead). */ +export type FilterOperator = 'in' | 'range' | 'is'; + +const OPERATOR_BY_KIND: Readonly< + Record +> = { + text: undefined, + keyword: 'in', + reference: 'in', + integer: 'range', + number: 'range', + date: 'range', + boolean: 'is', +}; + +/** + * The `where` operator a field of this kind accepts (per the ADR filter-semantics + * table), or `undefined` for `text` — which feeds the free-text `query` rather + * than `where`. Drives both the surface’s `where` input type and the adapter’s + * filter compiler from one rule. + */ +export function filterOperatorFor(kind: FieldKind): FilterOperator | undefined { + return OPERATOR_BY_KIND[kind]; +} + +/** The operator a concrete {@link Filter} carries, from its shape. */ +export function filterOperator(filter: Filter): FilterOperator { + if ('in' in filter) { + return 'in'; + } + if ('range' in filter) { + return 'range'; + } + return 'is'; +} + +/** + * Whether `field` can be filtered by `filter`: the field must be `filterable` + * and the filter’s shape must be the operator its kind accepts. Surfaces use it + * to reject malformed `where` input before it reaches the adapter. 
+ */ +export function acceptsFilter(field: SearchField, filter: Filter): boolean { + return ( + field.filterable === true && + filterOperator(filter) === filterOperatorFor(field.kind) + ); +} diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts new file mode 100644 index 00000000..2873d99c --- /dev/null +++ b/packages/search/src/schema.ts @@ -0,0 +1,184 @@ +import type { FramedNode } from './frame-by-type.js'; +import type { SearchDocument } from './project.js'; + +/** + * The engine-neutral kind of a queryable field — the runtime form of one SHACL + * property shape’s datatype/nodeKind. It drives every downstream behavior: + * which physical fields the projection emits, the engine collection-schema + * type, the `where`/facet/sort semantics, and the GraphQL output/input type. + * The Typesense-vocabulary types (`string`, `int32`, …) are *derived* from this + * by the engine adapter, never declared here. + */ +export type FieldKind = + | 'text' + | 'keyword' + | 'integer' + | 'number' + | 'boolean' + | 'date' + | 'reference'; + +/** + * One queryable field — the single declarative source that drives all four + * consumers (projection, engine collection schema, query semantics, and the + * GraphQL surface). The vocabulary mirrors SHACL + the `search:` annotations so + * a generator can later emit it unchanged from shapes: + * `kind`←`sh:datatype`/`sh:nodeKind`, `path`←`sh:path`, `array`←`sh:maxCount`, + * `localized`←`rdf:langString`/`sh:languageIn`, `ref`←`sh:node`/`sh:class`. + * + * Capability flags (`searchable`/`filterable`/`facetable`/`sortable`/`output`) + * are independent opt-ins: a field exposes exactly the roles it declares. A + * field with no `path` is a **derived field** — populated by a + * {@link Derivation} rather than projected from the IR — yet it still carries + * full query/schema/output behavior (e.g. `status`, the `*_group` companions, + * the compatibility booleans). 
+ * + * The physical field names a declaration fans out to (per-locale search/sort + * keys, the grouped-facet companion, …) follow one convention, owned by + * {@link physicalFields} so projection, collection-schema and query compiler + * cannot disagree. + */ +export interface SearchField { + /** Logical API name; the physical fanout derives from it. Declare camelCase + * where it surfaces in GraphQL. */ + readonly name: string; + readonly kind: FieldKind; + /** Framed-IR predicate IRI to project from (the SHACL `sh:path`). Omit for a + * derivation-populated field. */ + readonly path?: string; + /** Multi-valued (`sh:maxCount > 1`). */ + readonly array?: boolean; + /** Always present (`sh:minCount ≥ 1`): a non-null scalar in the API output and + * a non-optional field in the engine index. Moot for arrays/booleans/`id`, + * which are non-null regardless. */ + readonly required?: boolean; + /** Language-tagged text (`rdf:langString`); projected per locale. `text` only. */ + readonly localized?: boolean; + /** When `localized`, the languages to emit (the per-locale fanout). */ + readonly locales?: readonly string[]; + /** Appears in the API output type / carries a display label. */ + readonly output?: boolean; + /** Full-text inclusion with a `query_by` weight (folded; per-locale when + * `localized`). Presence is what makes a field searchable. */ + readonly searchable?: { readonly weight: number }; + /** Usable in `where`. */ + readonly filterable?: boolean; + /** Returned as facet buckets. */ + readonly facetable?: boolean; + /** Publicly selectable in `orderBy`; localized text also emits a folded sort key. */ + readonly sortable?: boolean; + /** For `kind: 'reference'`: the referenced shape and how much of it to carry. */ + readonly ref?: { + readonly type: string; + readonly strategy: 'labelOnly' | 'idOnly' | 'inline'; + }; + /** Projection-time value transform (e.g. strip a media-type prefix). 
*/ + readonly transform?: (value: string) => string; + /** Grouped-facet companion (a coarse `${name}_group`; deployment delta). */ + readonly group?: { readonly name: string; readonly prefix: string }; +} + +/** + * A computed field that is not a direct projection of a single path — a status + * rank, a `*_group` derived from a code table, a compatibility boolean. Reads + * the framed node and writes onto the flat document the field specs already + * populated. + */ +export type Derivation = (document: SearchDocument, node: FramedNode) => void; + +/** + * One root type’s complete search declaration — the runtime form of a single + * SHACL NodeShape: `type` is its `sh:targetClass`, `fields` are its property + * shapes (and derived fields), `derivations` are its `sh:rule`-shaped computed + * fields. A generator emits one of these per NodeShape. + */ +export interface SearchSchema { + readonly type: string; + readonly fields: readonly SearchField[]; + readonly derivations?: readonly Derivation[]; +} + +/** + * The physical engine fields one {@link SearchField} fans out into, grouped by + * the role each plays. The single source of truth for the naming convention, so + * the projection (writes them), the collection schema (declares them) and the + * query compiler (reads them) cannot disagree. + */ +export interface PhysicalFields { + /** The lone stored field for a non-localized kind — faceted, filtered, sorted + * and output directly. Absent for localized text (its value lives per locale). */ + readonly value?: string; + /** Per-locale output labels `${name}_${locale}` (localized text, `output`). */ + readonly display: readonly string[]; + /** Folded match fields: `${name}_search_${locale}` per locale (localized) or a + * single `${name}_search` (non-localized), when `searchable`. */ + readonly search: readonly string[]; + /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, + * `sortable`); a non-localized field sorts on its `value`. 
*/ + readonly sort: readonly string[]; + /** The grouped-facet companion `${name}_group`, when `group` is declared. */ + readonly group?: string; +} + +/** + * Full-text searchable fields, highest `query_by` weight first — the order the + * engine adapter weights `query_by` in. A field is searchable iff it carries a + * `searchable` weight. + */ +export function searchableFields( + schema: SearchSchema, +): readonly (SearchField & { + readonly searchable: { readonly weight: number }; +})[] { + return schema.fields + .filter( + (field): field is SearchField & { searchable: { weight: number } } => + field.searchable !== undefined, + ) + .sort((left, right) => right.searchable.weight - left.searchable.weight); +} + +/** Fields returned as facet buckets, in declaration order. */ +export function facetableFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.facetable === true); +} + +/** Fields usable in `where`, in declaration order. */ +export function filterableFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.filterable === true); +} + +/** Fields publicly selectable in `orderBy`, in declaration order. */ +export function sortableFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.sortable === true); +} + +/** Fields that appear in the API output type, in declaration order. */ +export function outputFields(schema: SearchSchema): readonly SearchField[] { + return schema.fields.filter((field) => field.output === true); +} + +/** Derive the physical engine field names a declaration produces. */ +export function physicalFields(field: SearchField): PhysicalFields { + const localized = field.kind === 'text' && field.localized === true; + const locales = localized ? (field.locales ?? 
[]) : []; + return { + // Localized text has no single value field — its values live in the + // per-locale fields; every other kind stores into one `${name}` field. + value: localized ? undefined : field.name, + display: + localized && field.output + ? locales.map((locale) => `${field.name}_${locale}`) + : [], + search: field.searchable + ? localized + ? locales.map((locale) => `${field.name}_search_${locale}`) + : [`${field.name}_search`] + : [], + sort: + localized && field.sortable + ? locales.map((locale) => `${field.name}_sort_${locale}`) + : [], + group: field.group ? `${field.name}_group` : undefined, + }; +} diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts new file mode 100644 index 00000000..54ad819d --- /dev/null +++ b/packages/search/test/engine.test.ts @@ -0,0 +1,110 @@ +import { describe, expect, it } from 'vitest'; +import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; +import type { SearchQuery } from '../src/query.js'; +import type { SearchSchema } from '../src/schema.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], +}; + +// A fake engine: the port is implementable and the result types compose into a +// logical document (language map + reference) the way a real engine returns. +const fake: SearchEngine = { + async search(query: SearchQuery): Promise { + return { + total: 1, + hits: [ + { + id: 'https://example/dataset/1', + document: { + title: { nl: ['Erfgoed'], und: [query.text ?? 
''] }, + publisher: { + id: 'https://example/org/1', + label: { nl: ['Archief'] }, + }, + keyword: ['kaarten', 'atlas'], + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, + }; + }, +}; + +describe('SearchEngine port', () => { + it('returns logical hits, total and facets through the port', async () => { + const query: SearchQuery = { + text: 'kaart', + where: [], + orderBy: [{ field: 'relevance', direction: 'desc' }], + limit: 20, + offset: 0, + facets: ['keyword'], + locale: 'nl', + }; + + const result = await fake.search(query, schema); + + expect(result.total).toBe(1); + expect(result.hits[0].id).toBe('https://example/dataset/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Erfgoed'], + und: ['kaart'], + }); + expect(result.facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); +}); + +describe('typed facet and document keys', () => { + it('keys facets and the result document by the schema’s field names', async () => { + // Captured as a literal (`as const satisfies`) so the `facetable`/`output` + // flags survive and the `…Of` helpers can read the field names off the type. + const datasetSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + { name: 'format', kind: 'keyword', array: true, facetable: true }, + { name: 'status', kind: 'keyword', facetable: true }, + ], + } as const satisfies SearchSchema; + + // facets ⊂ { format, status }, document keys ⊂ { title }. These object + // literals would not compile if the helpers widened to `string`/`never`. 
+ const engine: EngineFor = { + async search() { + return { + total: 1, + hits: [ + { + id: 'https://example/d/1', + document: { title: { nl: ['Titel'] } }, + }, + ], + facets: { format: [{ value: 'text/turtle', count: 2 }] }, + }; + }, + }; + + const result = await engine.search( + { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: ['format'], + locale: 'nl', + }, + datasetSchema, + ); + + expect(result.facets.format).toEqual([{ value: 'text/turtle', count: 2 }]); + expect(result.hits[0].document.title).toEqual({ nl: ['Titel'] }); + }); +}); diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 60c42f71..8f513baa 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -5,11 +5,9 @@ import { projectDocument, projectGraph, irisOf, - type FieldSpec, - type Derivation, - type Projection, type SearchDocument, } from '../src/project.js'; +import type { SearchField, SearchSchema, Derivation } from '../src/schema.js'; const DR = 'urn:dr:'; const IANA = 'https://www.iana.org/assignments/media-types/'; @@ -30,49 +28,50 @@ const node = { [`${DR}size`]: { '@type': xsd.integer.value, '@value': '1234' }, }; -const fields: FieldSpec[] = [ +const fields: SearchField[] = [ { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, - sort: true, + output: true, + searchable: { weight: 1 }, + sortable: true, }, { name: 'publisher', path: `${DR}publisherName`, - type: 'langText', + kind: 'text', + localized: true, locales: ['nl', 'en'], - display: true, - search: true, + output: true, + searchable: { weight: 1 }, }, { name: 'publisher', path: dcterms.publisher.value, - type: 'facet', - iri: true, + kind: 'reference', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'format', path: `${DR}format`, - type: 
'facet', + kind: 'keyword', transform: (value) => value.replace(IANA, ''), }, - { name: 'class', path: `${DR}class`, type: 'facet', iri: true }, + { name: 'class', path: `${DR}class`, kind: 'reference' }, { name: 'date_posted', path: `${DR}datePosted`, - type: 'date', + kind: 'date', }, - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, ]; const derivations: Derivation[] = [ @@ -81,11 +80,11 @@ const derivations: Derivation[] = [ }, ]; -const projection: Projection = { type: DATASET, fields, derivations }; +const schema: SearchSchema = { type: DATASET, fields, derivations }; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { - const document = projectDocument(node, projection); + const document = projectDocument(node, schema); expect(document.id).toBe('https://ex/d/1'); expect(document.title_nl).toBe('Titel'); @@ -121,23 +120,22 @@ describe('projectDocument', () => { { type: DATASET, fields: [ - { name: 'size', path: `${DR}size`, type: 'number' }, + { name: 'size', path: `${DR}size`, kind: 'integer' }, { name: 'language', path: dcterms.language.value, - type: 'facet', + kind: 'keyword', }, { name: 'keyword', path: dcat.keyword.value, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, }, { name: 'class', path: `${DR}class`, - type: 'facet', - iri: true, + kind: 'reference', }, ], }, @@ -157,8 +155,8 @@ describe('projectDocument', () => { { name: 'format', path: `${DR}format`, - type: 'facet', - search: true, + kind: 'keyword', + searchable: { weight: 1 }, transform: (value) => value.replace(IANA, ''), }, ], @@ -232,10 +230,11 @@ describe('projectDocument', () => { { name: 'title', path: dcterms.title.value, - // search only — display and sort not opted into. - type: 'langText', + // search only — display (output) and sort not opted into. 
+ kind: 'text', + localized: true, locales: ['nl', 'en'], - search: true, + searchable: { weight: 1 }, }, ], }, @@ -262,6 +261,38 @@ describe('projectDocument', () => { expect(document.title_search_nl).toBe('titel ondertitel'); }); + it('skips a field with no path, leaving it to a derivation (derived field)', () => { + const document = projectDocument( + { + '@id': 'https://ex/d/11', + [dcterms.title.value]: { '@language': 'nl', '@value': 'Titel' }, + }, + { + type: DATASET, + fields: [ + { + name: 'title', + path: dcterms.title.value, + kind: 'text', + localized: true, + locales: ['nl'], + output: true, + }, + // No `path`: a derived field — its value comes from a derivation, + // never from projection. + { name: 'status', kind: 'keyword', facetable: true }, + ], + derivations: [ + (derived) => { + derived.status = 'valid'; + }, + ], + }, + ); + expect(document.title_nl).toBe('Titel'); + expect(document.status).toBe('valid'); + }); + it('throws when the framed node has no @id', () => { expect(() => projectDocument( @@ -284,7 +315,8 @@ describe('projectDocument', () => { { name: 'title', path: dcterms.title.value, - type: 'langText', + kind: 'text', + localized: true, locales: [], }, ], @@ -295,7 +327,7 @@ describe('projectDocument', () => { }); describe('projectGraph', () => { - it('frames each projection’s type and projects matching nodes', async () => { + it('frames each schema’s type and projects matching nodes', async () => { const quads = new Parser({ format: 'N-Triples' }).parse(` <${rdf.type.value}> <${DATASET}> . <${dcterms.title.value}> "Titel"@nl . 
diff --git a/packages/search/test/query.test.ts b/packages/search/test/query.test.ts new file mode 100644 index 00000000..b82042f5 --- /dev/null +++ b/packages/search/test/query.test.ts @@ -0,0 +1,78 @@ +import { describe, expect, it } from 'vitest'; +import { acceptsFilter, filterOperatorFor } from '../src/query.js'; +import type { SearchField } from '../src/schema.js'; + +const keyword: SearchField = { + name: 'format', + kind: 'keyword', + array: true, + filterable: true, +}; +const datePosted: SearchField = { + name: 'datePosted', + kind: 'date', + filterable: true, +}; +const status: SearchField = { + name: 'status', + kind: 'keyword', + facetable: true, +}; +const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl'], + filterable: true, +}; + +describe('filterOperatorFor', () => { + it('maps each field kind to its `where` operator', () => { + expect(filterOperatorFor('text')).toBeUndefined(); + expect(filterOperatorFor('keyword')).toBe('in'); + expect(filterOperatorFor('reference')).toBe('in'); + expect(filterOperatorFor('integer')).toBe('range'); + expect(filterOperatorFor('number')).toBe('range'); + expect(filterOperatorFor('date')).toBe('range'); + expect(filterOperatorFor('boolean')).toBe('is'); + }); +}); + +describe('acceptsFilter', () => { + it('accepts a filter whose shape matches the field’s operator', () => { + expect( + acceptsFilter(keyword, { field: 'format', in: ['text/turtle'] }), + ).toBe(true); + expect( + acceptsFilter(datePosted, { + field: 'datePosted', + range: { min: '2024' }, + }), + ).toBe(true); + }); + + it('rejects a filter whose shape does not match the field’s operator', () => { + expect(acceptsFilter(keyword, { field: 'format', range: { min: 1 } })).toBe( + false, + ); + }); + + it('rejects a filter on a non-filterable field', () => { + expect(acceptsFilter(status, { field: 'status', in: ['valid'] })).toBe( + false, + ); + }); + + it('rejects any filter on a text field (it feeds the 
free-text query)', () => { + expect(acceptsFilter(title, { field: 'title', in: ['x'] })).toBe(false); + }); + + it('accepts an `is` filter on a filterable boolean field', () => { + const iiif: SearchField = { + name: 'iiif', + kind: 'boolean', + filterable: true, + }; + expect(acceptsFilter(iiif, { field: 'iiif', is: true })).toBe(true); + }); +}); diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts new file mode 100644 index 00000000..bd52d449 --- /dev/null +++ b/packages/search/test/schema.test.ts @@ -0,0 +1,209 @@ +import { describe, expect, it } from 'vitest'; +import { + facetableFields, + filterableFields, + outputFields, + physicalFields, + searchableFields, + sortableFields, + type SearchField, + type SearchSchema, +} from '../src/schema.js'; + +const DATASET = 'http://www.w3.org/ns/dcat#Dataset'; + +const schema: SearchSchema = { + type: DATASET, + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + }, + { + name: 'datePosted', + kind: 'date', + output: true, + filterable: true, + sortable: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('physicalFields', () => { + it('fans a localized text field out into per-locale display, search and sort keys', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + 
display: ['title_nl', 'title_en'], + search: ['title_search_nl', 'title_search_en'], + sort: ['title_sort_nl', 'title_sort_en'], + }); + }); + + it('gives a searchable keyword facet one value field and one folded search field', () => { + const keyword: SearchField = { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }; + + expect(physicalFields(keyword)).toEqual({ + value: 'keyword', + display: [], + search: ['keyword_search'], + sort: [], + }); + }); + + it('adds the `${name}_group` companion when a field declares a group', () => { + const format: SearchField = { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + group: { + name: 'format_group', + prefix: 'https://www.iana.org/assignments/media-types/', + }, + }; + + expect(physicalFields(format)).toEqual({ + value: 'format', + display: [], + search: [], + sort: [], + group: 'format_group', + }); + }); + + it('emits only the search keys for a search-only localized field (no display, no sort)', () => { + const creator: SearchField = { + name: 'creator', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + searchable: { weight: 2 }, + }; + + expect(physicalFields(creator)).toEqual({ + display: [], + search: ['creator_search_nl', 'creator_search_en'], + sort: [], + }); + }); + + it('emits no per-locale fields when a localized field declares no locales', () => { + const title: SearchField = { + name: 'title', + kind: 'text', + localized: true, + output: true, + searchable: { weight: 5 }, + sortable: true, + }; + + expect(physicalFields(title)).toEqual({ + display: [], + search: [], + sort: [], + }); + }); + + it('stores a reference field in one value field', () => { + const publisher: SearchField = { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }; + + 
expect(physicalFields(publisher)).toEqual({ + value: 'publisher', + display: [], + search: [], + sort: [], + }); + }); +}); + +describe('schema selectors', () => { + it('orders searchable fields by descending weight', () => { + expect(searchableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'keyword', + ]); + }); + + it('selects facetable, filterable, sortable and output fields by capability', () => { + expect(facetableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'status', + ]); + expect(filterableFields(schema).map((field) => field.name)).toEqual([ + 'keyword', + 'format', + 'datePosted', + 'status', + ]); + expect(sortableFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'datePosted', + ]); + expect(outputFields(schema).map((field) => field.name)).toEqual([ + 'title', + 'description', + 'datePosted', + 'status', + ]); + }); +}); From 00b5ba86ae6f7af69969e18e480d2c9c76b7d4d9 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Sun, 28 Jun 2026 20:22:54 +0200 Subject: [PATCH 02/13] feat(search-typesense): add collection-schema builder, query compiler and SearchEngine - buildCollectionSchema derives a Typesense collection from the unified SearchField model - buildSearchParams compiles SearchQuery into Typesense params (filter_by/sort_by/facet_by/query_by) - createTypesenseSearchEngine implements the SearchEngine port: compile, search, reconstruct - resolve reference and reference-facet labels from the sidecar labels collection in one lookup - add a testcontainer integration test and a generator-stability snapshot --- packages/search-typesense/README.md | 30 +- packages/search-typesense/package.json | 4 +- .../search-typesense/src/collection-schema.ts | 144 ++++++++++ packages/search-typesense/src/index.ts | 8 + .../search-typesense/src/query-compiler.ts | 202 +++++++++++++ packages/search-typesense/src/search.ts | 265 ++++++++++++++++++ .../generator-stability.test.ts.snap | 120 
++++++++ .../test/collection-schema.test.ts | 202 +++++++++++++ .../test/generator-stability.test.ts | 66 +++++ .../test/parse-response.test.ts | 145 ++++++++++ .../test/query-compiler.test.ts | 156 +++++++++++ .../test/search-engine.test.ts | 226 +++++++++++++++ packages/search-typesense/tsconfig.lib.json | 5 +- 13 files changed, 1563 insertions(+), 10 deletions(-) create mode 100644 packages/search-typesense/src/collection-schema.ts create mode 100644 packages/search-typesense/src/query-compiler.ts create mode 100644 packages/search-typesense/src/search.ts create mode 100644 packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap create mode 100644 packages/search-typesense/test/collection-schema.test.ts create mode 100644 packages/search-typesense/test/generator-stability.test.ts create mode 100644 packages/search-typesense/test/parse-response.test.ts create mode 100644 packages/search-typesense/test/query-compiler.test.ts create mode 100644 packages/search-typesense/test/search-engine.test.ts diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index b5d62bb9..ea681cae 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -1,13 +1,27 @@ # @lde/search-typesense -[Typesense](https://typesense.org/) engine adapter for RDF-backed search -pipelines. Engine-specific (Typesense) but domain-agnostic – the caller supplies -the collection schema and documents. - -The engine-agnostic half of the pipeline – framing `CONSTRUCT` quads into a -JSON-LD IR and projecting that IR into flat documents from a declarative field -spec – lives in [`@lde/search`](../search). This package consumes those -documents and writes them to Typesense. +[Typesense](https://typesense.org/) engine adapter for the engine- and +domain-agnostic [`@lde/search`](../search) core. **Engine-specific (Typesense) but +domain-agnostic** – you supply a `SearchSchema`; this package never names your +domain. 
It is the Typesense implementation of the `SearchEngine` port: it derives +a collection schema from the field model, compiles the neutral `SearchQuery` into +Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, +and manages the index lifecycle (blue/green rebuild). + +## Collection schema and engine + +`buildCollectionSchema(schema, { name, defaultSortingField, … })` derives a +Typesense collection from the unified `SearchField` model — the Typesense field +type comes from each field’s `kind`, and the physical fanout (per-locale +search/sort keys, the `_group` companion) matches what the projection writes, via +`@lde/search`’s `physicalFields`, so the index and the documents cannot drift. + +`createTypesenseSearchEngine(client, { collection, labelsCollection })` is the +`SearchEngine` implementation: it compiles the query, runs the search, resolves +reference (and reference-facet) labels from the sidecar `labels` collection in a +single lookup, and reconstructs the logical `SearchResult` — language maps, +labelled references, labelled facet buckets. The pure halves `buildSearchParams` +and `parseSearchResponse` are exported for direct use and testing. ## Indexing diff --git a/packages/search-typesense/package.json b/packages/search-typesense/package.json index b1dde852..445624fb 100644 --- a/packages/search-typesense/package.json +++ b/packages/search-typesense/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-typesense", "version": "0.1.1", - "description": "Generic Typesense engine adapter for RDF-backed search pipelines: collection lifecycle, bulk upsert and blue/green alias swap", + "description": "Typesense implementation of the @lde/search SearchEngine port: collection-schema builder, query compiler, label-resolving result reconstruction, and blue/green index lifecycle. 
Engine-specific (Typesense) but domain-agnostic.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-typesense" @@ -25,6 +25,8 @@ "!**/*.tsbuildinfo" ], "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts new file mode 100644 index 00000000..5141f634 --- /dev/null +++ b/packages/search-typesense/src/collection-schema.ts @@ -0,0 +1,144 @@ +import type { CollectionCreateSchema } from 'typesense'; +import type { CollectionFieldSchema } from 'typesense/lib/Typesense/Collection.js'; +import { + physicalFields, + type SearchField, + type SearchSchema, +} from '@lde/search'; + +/** Deployment-specific options the generic field model does not carry. */ +export interface CollectionSchemaOptions { + /** The Typesense collection (or alias) name. */ + readonly name: string; + /** Snowball stemming locale for non-localized searchable fields (default `nl`). + * Localized text search fields stem in their own locale. */ + readonly defaultLocale?: string; + /** The field Typesense sorts by when a query imposes no order. */ + readonly defaultSortingField?: string; + /** Synonym sets the collection references (synced separately). */ + readonly synonymSets?: readonly string[]; +} + +/** + * Build a Typesense collection schema from the unified {@link SearchSchema}, so + * the index and the projection are driven by one declarative source and cannot + * drift. Each field fans out into the same physical fields the projection writes + * ({@link physicalFields}); the Typesense field type is derived from the field + * `kind`, never re-declared. + * + * Stemming is enabled on every folded `*_search` field: localized text stems + * each `*_search_${locale}` in its own language, and a non-localized searchable + * field stems in `defaultLocale`. 
+ */ +export function buildCollectionSchema( + schema: SearchSchema, + options: CollectionSchemaOptions, +): CollectionCreateSchema { + const defaultLocale = options.defaultLocale ?? 'nl'; + const collection: CollectionCreateSchema = { + name: options.name, + fields: schema.fields.flatMap((field) => + typesenseFields(field, defaultLocale, options.defaultSortingField), + ), + }; + if (options.defaultSortingField !== undefined) { + collection.default_sorting_field = options.defaultSortingField; + } + if (options.synonymSets !== undefined) { + collection.synonym_sets = [...options.synonymSets]; + } + return collection; +} + +/** The physical Typesense fields one declaration produces. */ +function typesenseFields( + field: SearchField, + defaultLocale: string, + defaultSortingField: string | undefined, +): CollectionFieldSchema[] { + const names = physicalFields(field); + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + return [ + // Display labels: stored, not indexed for search (search uses the folded + // companions), accents preserved. + ...names.display.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + index: false, + optional: true, + }), + ), + // One folded search field per locale, each stemmed in its own language. + ...names.search.map( + (name, index): CollectionFieldSchema => ({ + name, + type: 'string', + optional: true, + stem: true, + locale: locales[index], + }), + ), + ...names.sort.map( + (name): CollectionFieldSchema => ({ + name, + type: 'string', + sort: true, + optional: true, + }), + ), + ]; + } + + const valueType = typesenseValueType(field); + const fields: CollectionFieldSchema[] = [ + { + name: field.name, + type: valueType, + facet: field.facetable ?? false, + sort: field.sortable ?? false, + // A `required` field is non-optional; so is the `default_sorting_field`, + // which Typesense requires to be present. Everything else may be absent. 
+ optional: field.required !== true && field.name !== defaultSortingField, + }, + ]; + if (field.searchable) { + for (const name of names.search) { + fields.push({ + name, + type: valueType, + optional: true, + stem: true, + locale: defaultLocale, + }); + } + } + if (names.group !== undefined) { + fields.push({ + name: names.group, + type: valueType, + facet: true, + optional: true, + }); + } + return fields; +} + +/** The Typesense field type for a non-localized field, from its `kind`. 64-bit + * integers (and dates, stored as Unix seconds) so large counts never overflow. */ +function typesenseValueType(field: SearchField): CollectionFieldSchema['type'] { + switch (field.kind) { + case 'integer': + case 'date': + return 'int64'; + case 'number': + return 'float'; + case 'boolean': + return 'bool'; + case 'keyword': + case 'reference': + case 'text': + return field.array === true ? 'string[]' : 'string'; + } +} diff --git a/packages/search-typesense/src/index.ts b/packages/search-typesense/src/index.ts index 6514638d..66247957 100644 --- a/packages/search-typesense/src/index.ts +++ b/packages/search-typesense/src/index.ts @@ -1 +1,9 @@ export { rebuild } from './adapter.js'; +export { buildCollectionSchema } from './collection-schema.js'; +export type { CollectionSchemaOptions } from './collection-schema.js'; +export { buildSearchParams } from './query-compiler.js'; +export { createTypesenseSearchEngine, parseSearchResponse } from './search.js'; +export type { + TypesenseSearchEngineOptions, + TypesenseSearchResponse, +} from './search.js'; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts new file mode 100644 index 00000000..fc9d4950 --- /dev/null +++ b/packages/search-typesense/src/query-compiler.ts @@ -0,0 +1,202 @@ +import type { SearchParams } from 'typesense/lib/Typesense/Documents.js'; +import { fold } from '@lde/text-normalization'; +import { + physicalFields, + searchableFields, + type Filter, + 
type SearchField, + type SearchQuery, + type SearchSchema, + type Sort, +} from '@lde/search'; + +/** + * Compile the engine-neutral {@link SearchQuery} into Typesense search + * parameters — the query half of the engine adapter. Pure (no client, no env), + * so the mapping is asserted directly in unit tests. Field names come from + * {@link physicalFields}, the same convention the projection and the collection + * schema use, so a query can never reference a field the index does not carry. + */ +export function buildSearchParams( + query: SearchQuery, + schema: SearchSchema, +): SearchParams { + const folded = + query.text !== undefined && query.text.length > 0 + ? fold(query.text) + : undefined; + const { names, weights } = queryFields(schema, query.locale); + const filterBy = compileFilterBy(query.where, schema); + const sortBy = query.orderBy + .map((sort) => compileSort(sort, schema, query.locale)) + .join(','); + const params: SearchParams = { + q: folded ?? '*', + query_by: names.join(','), + query_by_weights: weights.join(','), + per_page: query.limit, + page: query.limit > 0 ? Math.floor(query.offset / query.limit) + 1 : 1, // limit 0 (count/facet-only query) must not divide by zero + }; + if (filterBy.length > 0) { + params.filter_by = filterBy; + } + if (sortBy.length > 0) { + params.sort_by = sortBy; + } + if (query.facets.length > 0) { + params.facet_by = query.facets.join(','); + } + return params; +} + +/** + * The `query_by` fields and aligned weights. Each searchable field expands to its + * folded `*_search` companion(s); a localized field’s active-locale companion + * keeps its full weight while the other locale is gently demoted (−1, floored at + * 1), so a match in the user’s language ranks higher while cross-language matches + * still surface. 
+ */ +function queryFields( + schema: SearchSchema, + locale: string, +): { readonly names: string[]; readonly weights: number[] } { + const names: string[] = []; + const weights: number[] = []; + for (const field of searchableFields(schema)) { + const search = physicalFields(field).search; + const baseWeight = field.searchable.weight; + if (field.kind === 'text' && field.localized === true) { + const locales = field.locales ?? []; + search.forEach((name, index) => { + names.push(name); + weights.push( + locales[index] === locale ? baseWeight : Math.max(1, baseWeight - 1), + ); + }); + } else { + for (const name of search) { + names.push(name); + weights.push(baseWeight); + } + } + } + return { names, weights }; +} + +/** AND-join the compiled `where` clauses; skips unknown fields and empty clauses. */ +function compileFilterBy( + where: readonly Filter[], + schema: SearchSchema, +): string { + return where + .map((filter) => compileFilter(filter, schema)) + .filter((clause): clause is string => clause !== undefined) + .join(' && '); +} + +function compileFilter( + filter: Filter, + schema: SearchSchema, +): string | undefined { + const field = schema.fields.find( + (candidate) => candidate.name === filter.field, + ); + if (field === undefined) { + return undefined; + } + if ('in' in filter) { + return filter.in.length > 0 + ? compileMembership(field, filter.in) + : undefined; + } + if ('range' in filter) { + return compileRange(field.name, filter.range); + } + return `${field.name}:=${filter.is}`; +} + +/** + * A membership clause. A grouped field splits its values into `prefix`-tagged + * group tokens (matched against the `_group` companion) and granular values, and + * ORs the two so selecting a value and a group within one facet unions instead of + * intersecting. A non-facet (tokenized) field uses the exact `:=` operator so an + * IRI cannot partial-match on a shared path segment. 
+ */ +function compileMembership( + field: SearchField, + values: readonly string[], +): string { + const exact = field.facetable !== true; + if (field.group !== undefined) { + const prefix = field.group.prefix; + const groups = values.filter((value) => value.startsWith(prefix)); + const granular = values.filter((value) => !value.startsWith(prefix)); + const parts: string[] = []; + if (granular.length > 0) { + parts.push(membership(field.name, granular, exact)); + } + if (groups.length > 0) { + parts.push(membership(field.group.name, groups, false)); + } + return parts.length > 1 ? `(${parts.join(' || ')})` : parts[0]; + } + return membership(field.name, values, exact); +} + +function membership( + name: string, + values: readonly string[], + exact: boolean, +): string { + const list = `[${values.map(escapeFilterValue).join(',')}]`; + return exact ? `${name}:=${list}` : `${name}:${list}`; +} + +/** An inclusive Typesense range clause, or `undefined` when neither bound is set. */ +function compileRange( + name: string, + range: { readonly min?: number | string; readonly max?: number | string }, +): string | undefined { + const { min, max } = range; + if (min !== undefined && max !== undefined) { + return `${name}:[${min}..${max}]`; + } + if (min !== undefined) { + return `${name}:>=${min}`; + } + if (max !== undefined) { + return `${name}:<=${max}`; + } + return undefined; +} + +/** + * One `sort_by` term. `relevance` maps to Typesense’s `_text_match`; a localized + * text field sorts on its active-locale folded key; any other field (including a + * deployment tie-break like `status_rank`) sorts on its own name. 
+ */ +function compileSort(sort: Sort, schema: SearchSchema, locale: string): string { + if (sort.field === 'relevance') { + return `_text_match:${sort.direction}`; + } + const field = schema.fields.find( + (candidate) => candidate.name === sort.field, + ); + if ( + field !== undefined && + field.kind === 'text' && + field.localized === true + ) { + return `${field.name}_sort_${locale}:${sort.direction}`; + } + return `${sort.field}:${sort.direction}`; +} + +/** + * Backtick-wrap a filter value so reserved characters in IRIs and media types + * (`:`, `/`, `&`, `,`, …) are taken literally instead of parsed as filter syntax. + * An embedded backtick is escaped. + */ +function escapeFilterValue(value: string): string { + return `\`${value.replace(/`/g, '\\`')}\``; +} diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts new file mode 100644 index 00000000..e9d792c7 --- /dev/null +++ b/packages/search-typesense/src/search.ts @@ -0,0 +1,265 @@ +import type { Client } from 'typesense'; +import { + outputFields, + type FacetBucket, + type LocalizedValue, + type Reference, + type ResultDocument, + type SearchEngine, + type SearchField, + type SearchHit, + type SearchQuery, + type SearchResult, + type SearchSchema, + type SearchValue, +} from '@lde/search'; +import { buildSearchParams } from './query-compiler.js'; + +/** Where the engine reads documents and (optionally) reference labels. */ +export interface TypesenseSearchEngineOptions { + /** The dataset collection or alias to query. */ + readonly collection: string; + /** The sidecar `labels` collection (IRI → label); omit for id-only references. */ + readonly labelsCollection?: string; +} + +/** + * A Typesense-backed {@link SearchEngine}. 
`search` compiles the query + * ({@link buildSearchParams}), runs it, resolves the reference labels for the + * page of hits from the sidecar `labels` collection in one lookup, and + * reconstructs the engine-neutral {@link SearchResult} ({@link parseSearchResponse}). + * Every engine-specific detail stays here; consumers see only logical documents. + */ +export function createTypesenseSearchEngine( + client: Client, + options: TypesenseSearchEngineOptions, +): SearchEngine { + return { + async search( + query: SearchQuery, + schema: SearchSchema, + ): Promise<SearchResult> { + const params = buildSearchParams(query, schema); + const response = (await client + .collections(options.collection) + .documents() + .search(params)) as TypesenseSearchResponse; + const labels = + options.labelsCollection !== undefined + ? await fetchLabels( + client, + options.labelsCollection, + referenceIris(response, schema), + ) + : new Map<string, LocalizedValue>(); + return parseSearchResponse(response, schema, labels); + }, + }; +} + +/** Every distinct reference IRI across the page of hits. */ +function referenceIris( + response: TypesenseSearchResponse, + schema: SearchSchema, +): string[] { + const referenceFields = schema.fields + .filter((field) => field.kind === 'reference') + .map((field) => field.name); + const referenceFieldSet = new Set(referenceFields); + const iris = new Set<string>(); + for (const hit of response.hits ?? []) { + for (const name of referenceFields) { + const raw = hit.document[name]; + if (Array.isArray(raw)) { + for (const value of raw) { + iris.add(String(value)); + } + } else if (typeof raw === 'string') { + iris.add(raw); + } + } + } + // Reference-facet bucket values are IRIs too; resolve them in the same lookup. + for (const facet of response.facet_counts ?? 
[]) { + if (referenceFieldSet.has(facet.field_name)) { + for (const bucket of facet.counts) { + iris.add(bucket.value); + } + } + } + return [...iris]; +} + +/** + * Resolve labels for `iris` from the sidecar `labels` collection in a single + * `filter_by: id:[…]` lookup. Each `label_${locale}` becomes a language-map + * entry; the default `label` is the untagged (`und`) fallback when no locale + * variant exists. + */ +async function fetchLabels( + client: Client, + collection: string, + iris: readonly string[], +): Promise<Map<string, LocalizedValue>> { + const labels = new Map<string, LocalizedValue>(); + if (iris.length === 0) { + return labels; + } + const filter = `id:[${iris.map((iri) => `\`${iri.replace(/`/g, '\\`')}\``).join(',')}]`; + const response = (await client.collections(collection).documents().search({ + q: '*', + query_by: 'label', + filter_by: filter, + per_page: Math.min(iris.length, 250), // Typesense rejects per_page > 250; IRIs beyond the cap stay id-only + })) as TypesenseSearchResponse; + for (const hit of response.hits ?? []) { + labels.set(String(hit.document.id), labelToLocalizedValue(hit.document)); + } + return labels; +} + +/** Turn a `labels` document into a language map (`label_${locale}` → locale). */ +function labelToLocalizedValue( + document: Record<string, unknown>, +): LocalizedValue { + const map: Record<string, string[]> = {}; + for (const [key, value] of Object.entries(document)) { + if (key.startsWith('label_') && typeof value === 'string') { + map[key.slice('label_'.length)] = [value]; + } + } + if (Object.keys(map).length === 0 && typeof document.label === 'string') { + map.und = [document.label]; + } + return map; +} + +/** The subset of a Typesense search response this adapter reads. 
*/ +export interface TypesenseSearchResponse { + readonly found: number; + readonly hits?: readonly { readonly document: Record<string, unknown> }[]; + readonly facet_counts?: readonly { + readonly field_name: string; + readonly counts: readonly { + readonly value: string; + readonly count: number; + }[]; + }[]; +} + +/** + * Reconstruct a Typesense response into the engine-neutral {@link SearchResult}: + * the flat, fanned-out document is turned back into a logical one (per-locale + * display fields → a language map, reference IRIs → labelled references via the + * sidecar `labels` lookup, scalars passed through). `labels` maps a reference IRI + * to its resolved label; an IRI absent from it yields an id-only reference. + */ +export function parseSearchResponse( + response: TypesenseSearchResponse, + schema: SearchSchema, + labels: ReadonlyMap<string, LocalizedValue>, +): SearchResult { + const hits: SearchHit[] = (response.hits ?? []).map((hit) => ({ + id: String(hit.document.id), + document: reconstructDocument(hit.document, schema, labels), + })); + // Reference facets are IRI-keyed; their buckets carry a resolved data label. + // Plain facets (tokens, free strings) carry no label — the consumer owns display. + const referenceFacets = new Set( + schema.fields + .filter((field) => field.kind === 'reference') + .map((field) => field.name), + ); + const facets: Record<string, FacetBucket[]> = {}; + for (const facet of response.facet_counts ?? []) { + const labelled = referenceFacets.has(facet.field_name); + facets[facet.field_name] = facet.counts.map((bucket) => { + const label = labelled ? labels.get(bucket.value) : undefined; + return label === undefined + ? { value: bucket.value, count: bucket.count } + : { value: bucket.value, count: bucket.count, label }; + }); + } + return { hits, total: response.found, facets }; +} + +/** Rebuild one logical document from a flat Typesense document. 
*/ +function reconstructDocument( + flat: Record<string, unknown>, + schema: SearchSchema, + labels: ReadonlyMap<string, LocalizedValue>, +): ResultDocument { + const document: Record<string, SearchValue> = {}; + for (const field of outputFields(schema)) { + if (field.kind === 'boolean') { + // A boolean is always present; an absent value means false. + document[field.name] = flat[field.name] === true; + continue; + } + const value = logicalValue(flat, field, labels); + if (value !== undefined) { + document[field.name] = value; + } + } + return document; +} + +function logicalValue( + flat: Record<string, unknown>, + field: SearchField, + labels: ReadonlyMap<string, LocalizedValue>, +): SearchValue | undefined { + switch (field.kind) { + case 'text': + return localizedValue(flat, field); + case 'reference': + return referenceValue(flat, field, labels); + case 'keyword': { + const value = flat[field.name]; + return Array.isArray(value) || typeof value === 'string' + ? (value as SearchValue) + : undefined; + } + case 'integer': + case 'number': + case 'date': { + const value = flat[field.name]; + return typeof value === 'number' ? value : undefined; + } + case 'boolean': + return flat[field.name] === true; + } +} + +/** Gather the per-locale display fields back into a language map. */ +function localizedValue( + flat: Record<string, unknown>, + field: SearchField, +): LocalizedValue | undefined { + const map: Record<string, string[]> = {}; + for (const locale of field.locales ?? []) { + const value = flat[`${field.name}_${locale}`]; + if (typeof value === 'string') { + map[locale] = [value]; + } + } + return Object.keys(map).length > 0 ? map : undefined; +} + +/** Map stored reference IRIs to labelled references; id-only when no label. */ +function referenceValue( + flat: Record<string, unknown>, + field: SearchField, + labels: ReadonlyMap<string, LocalizedValue>, +): SearchValue | undefined { + const raw = flat[field.name]; + if (raw === undefined) { + return undefined; + } + const iris = Array.isArray(raw) ? 
(raw as string[]) : [String(raw)]; + const references: Reference[] = iris.map((iri) => { + const label = labels.get(iri); + return label === undefined ? { id: iri } : { id: iri, label }; + }); + return field.array === true ? references : references[0]; +} diff --git a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..201512f7 --- /dev/null +++ b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,120 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`collection-schema generator stability > derives a stable Typesense collection for a representative schema 1`] = ` +{ + "default_sorting_field": "size", + "fields": [ + { + "index": false, + "name": "title_nl", + "optional": true, + "type": "string", + }, + { + "index": false, + "name": "title_en", + "optional": true, + "type": "string", + }, + { + "locale": "nl", + "name": "title_search_nl", + "optional": true, + "stem": true, + "type": "string", + }, + { + "locale": "en", + "name": "title_search_en", + "optional": true, + "stem": true, + "type": "string", + }, + { + "name": "title_sort_nl", + "optional": true, + "sort": true, + "type": "string", + }, + { + "name": "title_sort_en", + "optional": true, + "sort": true, + "type": "string", + }, + { + "facet": true, + "name": "keyword", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "locale": "nl", + "name": "keyword_search", + "optional": true, + "stem": true, + "type": "string[]", + }, + { + "facet": true, + "name": "format", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "format_group", + "optional": true, + "type": "string[]", + }, + { + "facet": true, + "name": "creator", + "optional": true, + "sort": false, + "type": "string[]", + }, + { + "facet": true, + "name": "status", + "optional": 
false, + "sort": false, + "type": "string", + }, + { + "facet": true, + "name": "size", + "optional": false, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "score", + "optional": true, + "sort": false, + "type": "float", + }, + { + "facet": false, + "name": "created", + "optional": true, + "sort": true, + "type": "int64", + }, + { + "facet": true, + "name": "open", + "optional": true, + "sort": false, + "type": "bool", + }, + ], + "name": "things", + "synonym_sets": [ + "things-synonyms", + ], +} +`; diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts new file mode 100644 index 00000000..51511122 --- /dev/null +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -0,0 +1,202 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchSchema } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + path: 'https://def.nde.nl/format', + kind: 'keyword', + array: true, + facetable: true, + group: { name: 'format_group', prefix: 'group:' }, + }, + // Derived fields (no path) still get collection fields — populated at index + // time by derivations, not projected. 
+ { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + { + name: 'size', + kind: 'integer', + facetable: true, + sortable: true, + }, + { name: 'iiif', kind: 'boolean', facetable: true }, + { + name: 'publisher', + path: 'http://purl.org/dc/terms/publisher', + kind: 'reference', + array: true, + facetable: true, + }, + { + name: 'datePosted', + path: 'https://def.nde.nl/datePosted', + kind: 'date', + sortable: true, + }, + { + name: 'score', + kind: 'number', + facetable: true, + }, + ], +}; + +describe('buildCollectionSchema', () => { + const collection = buildCollectionSchema(schema, { + name: 'datasets', + defaultLocale: 'nl', + defaultSortingField: 'statusRank', + synonymSets: ['dataset-synonyms'], + }); + + it('carries the collection name, default sorting field and synonym sets', () => { + expect(collection.name).toBe('datasets'); + expect(collection.default_sorting_field).toBe('statusRank'); + expect(collection.synonym_sets).toEqual(['dataset-synonyms']); + }); + + it('fans a localized text field into display, per-locale stemmed search and sort keys', () => { + expect(collection.fields).toContainEqual({ + name: 'title_nl', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_en', + type: 'string', + index: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_nl', + type: 'string', + optional: true, + stem: true, + locale: 'nl', + }); + expect(collection.fields).toContainEqual({ + name: 'title_search_en', + type: 'string', + optional: true, + stem: true, + locale: 'en', + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_nl', + type: 'string', + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'title_sort_en', + type: 'string', + sort: true, + optional: true, + }); + }); + + it('maps keyword/reference/integer/boolean 
kinds to Typesense value fields', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + // `status` is required → non-optional, like the default sorting field. + expect(collection.fields).toContainEqual({ + name: 'status', + type: 'string', + facet: true, + sort: false, + optional: false, + }); + // statusRank is the default_sorting_field, which Typesense requires to be + // non-optional. + expect(collection.fields).toContainEqual({ + name: 'statusRank', + type: 'int64', + facet: false, + sort: true, + optional: false, + }); + expect(collection.fields).toContainEqual({ + name: 'size', + type: 'int64', + facet: true, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'iiif', + type: 'bool', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'publisher', + type: 'string[]', + facet: true, + sort: false, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'datePosted', + type: 'int64', + facet: false, + sort: true, + optional: true, + }); + expect(collection.fields).toContainEqual({ + name: 'score', + type: 'float', + facet: true, + sort: false, + optional: true, + }); + }); + + it('emits a folded, stemmed search companion for a searchable keyword field', () => { + expect(collection.fields).toContainEqual({ + name: 'keyword_search', + type: 'string[]', + optional: true, + stem: true, + locale: 'nl', + }); + }); + + it('emits the grouped-facet companion for a field that declares a group', () => { + expect(collection.fields).toContainEqual({ + name: 'format_group', + type: 'string[]', + facet: true, + optional: true, + }); + }); +}); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts new file mode 100644 index 00000000..2383ecde --- /dev/null +++ 
b/packages/search-typesense/test/generator-stability.test.ts @@ -0,0 +1,66 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchSchema } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. The + * derived Typesense collection is snapshotted purely to pin the **generator**: + * any change to how `buildCollectionSchema` maps the field model (Typesense field + * types, the physical fanout, stem/locale, optional/default-sorting-field, group + * companions) surfaces as a snapshot diff before this library is published. `status` is the fixture’s only `required` field and `size` is the one passed as `defaultSortingField` below, so both non-optional cases asserted in collection-schema.test.ts (`optional: false`) are present in the snapshot. Field order and the test title key the stored snapshot; changing either rewrites it. + */ +const THING: SearchSchema = { + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + group: { name: 'format_group', prefix: 'group:' }, + }, + { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, required: true }, + { name: 'size', kind: 'integer', facetable: true, sortable: true }, + { name: 'score', kind: 'number', facetable: true }, + { name: 'created', kind: 'date', sortable: true }, + { name: 'open', kind: 'boolean', facetable: true }, + ], +}; + +describe('collection-schema generator stability', () => { + it('derives a stable Typesense collection for a representative schema', () => { + expect( + buildCollectionSchema(THING, { + name: 'things', + defaultSortingField: 'size', + defaultLocale: 'nl', + synonymSets: ['things-synonyms'], + }), + ).toMatchSnapshot(); + }); +}); diff --git
a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts new file mode 100644 index 00000000..50e601a4 --- /dev/null +++ b/packages/search-typesense/test/parse-response.test.ts @@ -0,0 +1,145 @@ +import { describe, expect, it } from 'vitest'; +import type { LocalizedValue, SearchSchema } from '@lde/search'; +import { parseSearchResponse } from '../src/search.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'size', kind: 'integer', output: true }, + { name: 'datePosted', kind: 'date', output: true }, + { name: 'iiif', kind: 'boolean', facetable: true, output: true }, + // A non-output field is never reconstructed into the logical document. 
+ { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + ], +}; + +const labels = new Map([ + ['https://org/1', { nl: ['Het Utrechts Archief'] }], + ['https://org/2', { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }], +]); + +const response = { + found: 2, + hits: [ + { + document: { + id: 'https://d/1', + title_nl: 'Titel', + title_en: 'Title', + keyword: ['kaarten'], + publisher: ['https://org/1'], + size: 1234, + datePosted: 1_700_000_000, + iiif: true, + status: 'valid', + }, + }, + { + document: { + id: 'https://d/2', + title_nl: 'Andere', + keyword: ['atlas', 'kaart'], + publisher: ['https://org/2', 'https://org/3'], + }, + }, + ], + facet_counts: [ + { + field_name: 'keyword', + counts: [ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ], + }, + { + // A reference facet: buckets are keyed by IRI and carry resolved labels. + field_name: 'publisher', + counts: [ + { value: 'https://org/1', count: 2 }, + { value: 'https://org/3', count: 1 }, + ], + }, + ], +}; + +describe('parseSearchResponse', () => { + const result = parseSearchResponse(response, schema, labels); + + it('carries the total and the facet buckets keyed by field name', () => { + expect(result.total).toBe(2); + // A plain facet: buckets carry no label. + expect(result.facets.keyword).toEqual([ + { value: 'kaarten', count: 3 }, + { value: 'atlas', count: 1 }, + ]); + }); + + it('attaches resolved labels to reference-facet buckets, id-only when unlabelled', () => { + expect(result.facets.publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { value: 'https://org/3', count: 1 }, + ]); + }); + + it('reconstructs localized text into a best-available language map', () => { + expect(result.hits[0].id).toBe('https://d/1'); + expect(result.hits[0].document.title).toEqual({ + nl: ['Titel'], + en: ['Title'], + }); + // Only the present locale is emitted. 
+ expect(result.hits[1].document.title).toEqual({ nl: ['Andere'] }); + }); + + it('resolves reference IRIs to labelled references, id-only when unlabelled', () => { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + { id: 'https://org/3' }, + ]); + }); + + it('passes keyword arrays and numeric scalars through, and omits absent fields', () => { + expect(result.hits[0].document.keyword).toEqual(['kaarten']); + expect(result.hits[0].document.size).toBe(1234); + expect(result.hits[0].document.datePosted).toBe(1_700_000_000); + expect(result.hits[1].document.size).toBeUndefined(); + }); + + it('defaults an absent boolean to false and never reconstructs non-output fields', () => { + expect(result.hits[0].document.iiif).toBe(true); + expect(result.hits[1].document.iiif).toBe(false); + expect(result.hits[0].document.status).toBeUndefined(); + }); +}); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts new file mode 100644 index 00000000..acdd9f7a --- /dev/null +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -0,0 +1,156 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchQuery, SearchSchema } from '@lde/search'; +import { buildSearchParams } from '../src/query-compiler.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + path: 'http://purl.org/dc/terms/title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + path: 'http://www.w3.org/ns/dcat#keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + }, + { + name: 'format', + kind: 
'keyword', + array: true, + facetable: true, + filterable: true, + group: { name: 'format_group', prefix: 'group:' }, + }, + // Filter-only, non-facet (tokenized) → exact `:=` membership. + { name: 'catalog', kind: 'keyword', array: true, filterable: true }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'size', kind: 'integer', filterable: true, sortable: true }, + { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, + ], +}; + +const base: SearchQuery = { + where: [], + orderBy: [], + limit: 20, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('buildSearchParams', () => { + it('browses with a match-all q and the weighted query_by fields', () => { + const params = buildSearchParams(base, schema); + expect(params.q).toBe('*'); + expect(params.query_by).toBe( + 'title_search_nl,title_search_en,keyword_search', + ); + expect(params.per_page).toBe(20); + expect(params.page).toBe(1); + expect(params.filter_by).toBeUndefined(); + expect(params.sort_by).toBeUndefined(); + }); + + it('folds the query text and boosts the active locale in query_by_weights', () => { + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'nl' }, schema), + ).toMatchObject({ q: 'kaart', query_by_weights: '5,4,1' }); + expect( + buildSearchParams({ ...base, text: 'Kaart', locale: 'en' }, schema) + .query_by_weights, + ).toBe('4,5,1'); + }); + + it('maps offset/limit to numbered pages', () => { + expect( + buildSearchParams({ ...base, offset: 40, limit: 20 }, schema).page, + ).toBe(3); + }); + + it('compiles where clauses, with exact membership for non-facet fields and grouped OR', () => { + const params = buildSearchParams( + { + ...base, + where: [ + { field: 'status', in: ['valid'] }, + { field: 'keyword', in: ['kaarten', 'atlas'] }, + { field: 'catalog', in: ['urn:cat'] }, + { field: 'format', in: ['text/turtle', 'group:rdf'] }, + { field: 'size', range: { min: 1, max: 10 } }, + { field: 'iiif', is: true }, + ], + }, + 
schema, + ); + expect(params.filter_by).toBe( + 'status:[`valid`] && ' + + 'keyword:[`kaarten`,`atlas`] && ' + + 'catalog:=[`urn:cat`] && ' + + '(format:[`text/turtle`] || format_group:[`group:rdf`]) && ' + + 'size:[1..10] && ' + + 'iiif:=true', + ); + }); + + it('compiles a one-sided range bound', () => { + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { min: 5 } }] }, + schema, + ).filter_by, + ).toBe('size:>=5'); + expect( + buildSearchParams( + { ...base, where: [{ field: 'size', range: { max: 9 } }] }, + schema, + ).filter_by, + ).toBe('size:<=9'); + }); + + it('compiles orderBy: RELEVANCE → _text_match and a localized field → its sort key', () => { + expect( + buildSearchParams( + { + ...base, + orderBy: [ + { field: 'relevance', direction: 'desc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('_text_match:desc,status_rank:asc'); + + expect( + buildSearchParams( + { + ...base, + locale: 'nl', + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'status_rank', direction: 'asc' }, + ], + }, + schema, + ).sort_by, + ).toBe('title_sort_nl:asc,status_rank:asc'); + }); + + it('requests facets by their logical field name', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'format'] }, schema) + .facet_by, + ).toBe('keyword,format'); + }); +}); diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts new file mode 100644 index 00000000..3a392f8a --- /dev/null +++ b/packages/search-typesense/test/search-engine.test.ts @@ -0,0 +1,226 @@ +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import type { Client } from 'typesense'; +import type { SearchEngine, SearchQuery, SearchSchema } from '@lde/search'; +import { buildCollectionSchema } from '../src/collection-schema.js'; +import { createTypesenseSearchEngine } from '../src/search.js'; +import { TypesenseContainer } from 
'./typesense-container.js'; + +const datasetSchema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'http://xmlns.com/foaf/0.1/Agent', strategy: 'labelOnly' }, + }, + { name: 'status', kind: 'keyword', facetable: true, filterable: true }, + { name: 'statusRank', kind: 'integer', sortable: true }, + ], +}; + +// Flat documents, as the projection would emit them (physical field names). +const documents = [ + { + id: 'd1', + title_nl: 'Kaart van Utrecht', + title_en: 'Map of Utrecht', + title_search_nl: 'kaart van utrecht', + title_search_en: 'map of utrecht', + title_sort_nl: 'kaart van utrecht', + title_sort_en: 'map of utrecht', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd2', + title_nl: 'Atlas der Nederlanden', + title_search_nl: 'atlas der nederlanden', + title_sort_nl: 'atlas der nederlanden', + keyword: ['atlas'], + keyword_search: ['atlas'], + publisher: ['https://org/2'], + status: 'valid', + statusRank: 0, + }, + { + id: 'd3', + title_nl: 'Verouderde kaart', + title_search_nl: 'verouderde kaart', + title_sort_nl: 'verouderde kaart', + keyword: ['kaarten'], + keyword_search: ['kaarten'], + publisher: ['https://org/1'], + status: 'invalid', + statusRank: 3, + }, +]; + +const labelDocuments = [ + { + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + type: 'organization', + }, + { + id: 'https://org/2', + label: 'Rijksmuseum', + label_nl: 'Rijksmuseum', + label_en: 'Rijksmuseum', + type: 'organization', 
+ }, +]; + +const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', +}; + +describe('createTypesenseSearchEngine (integration)', () => { + const container = new TypesenseContainer(); + let client: Client; + let engine: SearchEngine; + + beforeAll(async () => { + client = await container.start(); + // Typesense accepts the generated schema (stemming, locales, int64, …). + await client.collections().create( + buildCollectionSchema(datasetSchema, { + name: 'datasets', + defaultSortingField: 'statusRank', + defaultLocale: 'nl', + }), + ); + await client.collections().create({ + name: 'labels', + fields: [ + { name: 'label', type: 'string' }, + { name: 'label_nl', type: 'string', optional: true, index: false }, + { name: 'label_en', type: 'string', optional: true, index: false }, + { name: 'type', type: 'string', facet: true }, + ], + }); + await client + .collections('datasets') + .documents() + .import(documents, { action: 'create' }); + await client + .collections('labels') + .documents() + .import(labelDocuments, { action: 'create' }); + + engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + }); + }, 120_000); + + afterAll(async () => { + await container.stop(); + }); + + it('filters by status, sorts by the localized title key, and resolves reference labels', async () => { + const result = await engine.search( + { + ...baseQuery, + where: [{ field: 'status', in: ['valid'] }], + orderBy: [ + { field: 'title', direction: 'asc' }, + { field: 'statusRank', direction: 'asc' }, + ], + }, + datasetSchema, + ); + + // d3 is invalid → filtered out; remaining two sorted by folded title. 
+ expect(result.total).toBe(2); + expect(result.hits.map((hit) => hit.id)).toEqual(['d2', 'd1']); + expect(result.hits[0].document.title).toEqual({ + nl: ['Atlas der Nederlanden'], + }); + expect(result.hits[0].document.publisher).toEqual([ + { + id: 'https://org/2', + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + expect(result.hits[1].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + }); + + it('ranks a full-text query through the weighted query_by fields', async () => { + const result = await engine.search( + { + ...baseQuery, + text: 'Utrecht', + orderBy: [{ field: 'relevance', direction: 'desc' }], + }, + datasetSchema, + ); + + expect(result.hits[0].id).toBe('d1'); + expect(result.hits.map((hit) => hit.id)).not.toContain('d2'); + }); + + it('returns facet buckets with counts, labelling reference facets', async () => { + const result = await engine.search( + { ...baseQuery, facets: ['keyword', 'publisher'] }, + datasetSchema, + ); + + // Plain facet: value + count, no label. + const keyword = [...result.facets.keyword].sort( + (a, b) => b.count - a.count, + ); + expect(keyword).toEqual([ + { value: 'kaarten', count: 2 }, + { value: 'atlas', count: 1 }, + ]); + + // Reference facet: IRI-keyed buckets carry the resolved data label. 
+ const publisher = [...result.facets.publisher].sort( + (a, b) => b.count - a.count, + ); + expect(publisher).toEqual([ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + { + value: 'https://org/2', + count: 1, + label: { nl: ['Rijksmuseum'], en: ['Rijksmuseum'] }, + }, + ]); + }); +}); diff --git a/packages/search-typesense/tsconfig.lib.json b/packages/search-typesense/tsconfig.lib.json index e7c2ce37..52ca4bb7 100644 --- a/packages/search-typesense/tsconfig.lib.json +++ b/packages/search-typesense/tsconfig.lib.json @@ -8,7 +8,10 @@ "types": ["node"] }, "include": ["src/**/*.ts"], - "references": [], + "references": [ + { "path": "../search/tsconfig.lib.json" }, + { "path": "../text-normalization/tsconfig.lib.json" } + ], "exclude": [ "vite.config.ts", "vite.config.mts", From 66969a2109ca6dd20146bdf46322dc2d50fc8014 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Sun, 28 Jun 2026 20:23:14 +0200 Subject: [PATCH 03/13] feat(search-api-graphql): add the runtime-configured GraphQL surface - buildSearchSchema builds an executable GraphQLSchema from any SearchSchema at runtime (no codegen) - one generic resolver maps args to SearchQuery, calls the engine, and maps the result back - derive output, where, orderBy and facet types plus nullability from the field model - best-first Accept-Language output ordering; nullable facet label for reference facets - add printSearchSchema for a consumer SDL snapshot, plus a generator-stability snapshot --- .../0004-search-api-graphql-surface.md | 82 +++- packages/search-api-graphql/README.md | 55 +++ packages/search-api-graphql/eslint.config.mjs | 22 + packages/search-api-graphql/package.json | 32 ++ .../search-api-graphql/src/build-schema.ts | 445 ++++++++++++++++++ packages/search-api-graphql/src/index.ts | 7 + packages/search-api-graphql/src/language.ts | 47 ++ .../generator-stability.test.ts.snap | 106 +++++ .../test/build-schema.test.ts | 349 ++++++++++++++ 
.../test/generator-stability.test.ts | 97 ++++ packages/search-api-graphql/tsconfig.json | 13 + packages/search-api-graphql/tsconfig.lib.json | 26 + .../search-api-graphql/tsconfig.spec.json | 29 ++ packages/search-api-graphql/vite.config.ts | 21 + tsconfig.json | 3 + 15 files changed, 1322 insertions(+), 12 deletions(-) create mode 100644 packages/search-api-graphql/README.md create mode 100644 packages/search-api-graphql/eslint.config.mjs create mode 100644 packages/search-api-graphql/package.json create mode 100644 packages/search-api-graphql/src/build-schema.ts create mode 100644 packages/search-api-graphql/src/index.ts create mode 100644 packages/search-api-graphql/src/language.ts create mode 100644 packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap create mode 100644 packages/search-api-graphql/test/build-schema.test.ts create mode 100644 packages/search-api-graphql/test/generator-stability.test.ts create mode 100644 packages/search-api-graphql/tsconfig.json create mode 100644 packages/search-api-graphql/tsconfig.lib.json create mode 100644 packages/search-api-graphql/tsconfig.spec.json create mode 100644 packages/search-api-graphql/vite.config.ts diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index d6aff824..54c34000 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -29,7 +29,7 @@ that schema. A better name for the draft’s “generation” step, at least for **runtime configuration**. This matters because the resolvers are inherently generic – there is essentially one root -resolver that maps args to a `SearchQuery`, calls the adapter, and maps the result back; +resolver that maps args to a `SearchQuery`, calls the engine, and maps the result back; the field model only parameterises data. 
Codegen would emit N near-identical resolver stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. @@ -45,8 +45,10 @@ accidental breaking changes to the frozen contract – not a shipped artifact. ### The schema-building function ```ts -function buildSearchSchema( - schema: SearchSchema, +// Generic over the config *value’s* type (capture it `as const satisfies SearchSchema`), so +// one declaration drives both the runtime schema and the static TS types below. +function buildSearchSchema<S extends SearchSchema>( + schema: S, options: { typeName: string; // 'Dataset' – drives all derived type names queryField?: string; // root field; default lowercased plural of typeName @@ -60,6 +62,13 @@ function buildSearchSchema( }, ): GraphQLSchema; // executable schema: types + generic resolvers attached +// Static types derived from the SAME config value’s type (compile-time only, erased at +// runtime); one source, no codegen, no drift. Exported for typed in-process callers/tests. +type OutputOf<S>; // { id: string; title: LanguageString[]; size: number | null; … } +type WhereOf<S>; // { format?: StringFilter; size?: FloatRange; … } +type OrderByOf<S>; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } +type FacetOf<S>; // the facetable-field-name union + // also exported for manual composition / non-default servers: function buildSearchTypeDefsAndResolvers( schema, @@ -74,6 +83,38 @@ function printSearchSchema(schema, options): string; // SDL, for a snapshot/brea `extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or composes the exported typeDefs/resolvers by hand.
+### A typed surface the contract does not depend on + +Because `buildSearchSchema` is generic over the config _value_ (`<S extends SearchSchema>`), one +`as const satisfies SearchSchema` declaration drives two **independent** projections: + +- **the runtime contract** – the `GraphQLSchema`, built at startup by reading the value + (`field.kind`, `output`, `facetable`, …); and +- **a static TS mirror** – `OutputOf<S>` / `WhereOf<S>` / `OrderByOf<S>` / `FacetOf<S>`, + computed from `typeof schema` via mapped types. + +The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time +only and TS types are erased, so the served schema is byte-identical whether or not the +mirror types exist – they are a developer-experience overlay, never the source. The two are +parallel derivations of one value: the runtime kind→GraphQL-type mapping lives in +`buildSearchSchema`; the type-level mapping in `OutputOf<S>` duplicates it. They can drift, +so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot test (the +real artifact), while the TS mirror only catches our own coding mistakes against it.
+ +Values are typed at both ends, with the resolver as the typed transform between them: + +| layer | localized text | reference | int64 | keyword (array) | boolean | +| ----------------------- | ------------------------------------ | --------------------------- | ---------------- | ----------------------- | -------------------- | +| IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | +| GraphQL (`OutputOf<S>`) | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | + +What stays unchecked is only the **generic resolver’s dynamic middle**: it loops over the +field model with runtime-string names, so TS cannot prove the object it builds matches +`OutputOf<S>` – it casts at that boundary, and graphql-js’s executor (not TS) enforces the +output types at runtime (a wrong-typed return raises a field error). This is the same +“typed boundaries, dynamic middle” shape as the engine port and the projection: type the +edges where it is honest, accept a cast where iteration is inherently dynamic.
+ `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int` (signed 32-bit); + `number` → `Float` (exact integers to 2^53); `date` → `String` (ISO 8601); `boolean` → + `Boolean!` (absent = false); `reference` → see below. Nullability from `array` / required / + optional; `id` is `String!`. A field whose magnitude can exceed 32 bits (a 64-bit count or + byte size – e.g. DR’s `size`) is modelled as `number` → `Float`, since GraphQL’s `Int` + would overflow; a `Long`/`BigInt` custom scalar is the deferred alternative. - **Reference types** – a `reference` field is typed by the **referenced shape** (`sh:class`/`sh:node`), emitted once and reused by every field referencing the same shape. Its fields follow `nestedStrategy`: @@ -113,6 +157,14 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). is rejected where `[DatasetOrderBy!]` is expected), so a future array is a deliberate, potentially breaking change – not a free one. - **Facets** – an enum of every `facetable` field; requested per query, returned with counts. + A bucket’s `value` is its selection key; `label` is the **nullable** display label. + The engine resolves `label` only for **reference** facets — IRI-keyed buckets whose + canonical multilingual label is _data_, fetched from the sidecar `labels` collection in the + same lookup as hit references. It is `null` for token facets (e.g. `status`) and + free-string facets (e.g. `keyword`): those carry no data label, and the consumer owns their + display — its own i18n catalog for controlled tokens (`valid` → “Geldig”/“Valid”, which the + engine cannot and must not fabricate), or the `value` itself for free strings. The null is + load-bearing: it tells a client whether a server-resolved label exists or display is theirs. ### Resulting schema (DR example, abridged) @@ -138,7 +190,7 @@ type Dataset { terminologySource: [Term!]! format: [String!]! class: [String!]! 
- size: Int + size: Float # int64 magnitude → Float, not Int (32-bit); see note below datePosted: String status: String iiif: Boolean! @@ -152,6 +204,10 @@ input IntRange { min: Int max: Int } +input FloatRange { + min: Float + max: Float +} input DateRange { min: String max: String @@ -162,7 +218,7 @@ input DatasetWhere { format: StringFilter class: StringFilter status: StringFilter - size: IntRange + size: FloatRange datePosted: DateRange iiif: Boolean # … keyword, language, terminologySource, catalog, ndeSchemaAp, linkedData, terms, persistentUris @@ -198,8 +254,9 @@ enum DatasetFacetField { PERSISTENT_URIS } type FacetBucket { - value: String! + value: String! # the selection key (an IRI for reference facets, else a token/string) count: Int! + label: [LanguageString!] # nullable — see below } type Facet { field: DatasetFacetField! @@ -244,7 +301,7 @@ The single, generic root resolver (shipped in the package, not emitted): 2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; DR injects its policy here: default `status:=valid`; default sort `relevance` when a `query` is present else `title`; and the `status_rank` tie-break appended to either. -3. **`context.adapter.search(query, schema)` → `SearchResult`.** +3. **`context.engine.search(query, schema)` → `SearchResult`.** 4. **`SearchResult` → output** – scalars pass through; a `LocalizedValue` map → `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; reference values likewise; facets keyed logical→enum. GraphQL field selection prunes. @@ -271,7 +328,7 @@ then untagged (`und`) last – so `[0]` is always the best available value. 
```ts interface SearchContext { - adapter: SearchAdapter; // any engine + engine: SearchEngine; // the port; any engine adapter acceptLanguage: readonly string[]; // parsed, ordered; drives locale + output ordering } ``` @@ -296,6 +353,7 @@ Each transport populates it per request; no framework type appears in the packag additive `inline` growth. - Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor - pagination; a `Date` scalar (kept ISO `String`); transport-layer persisted queries / cost + pagination; a `Date` scalar (kept ISO `String`) and a `Long`/`BigInt` scalar for 64-bit + integers (kept `Float`); transport-layer persisted queries / cost limits; a root or per-field language argument (Accept-Language is the sole preference mechanism); metadata-language-availability filtering (a facetable dimension, not v1). diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md new file mode 100644 index 00000000..88f8cdb3 --- /dev/null +++ b/packages/search-api-graphql/README.md @@ -0,0 +1,55 @@ +# @lde/search-api-graphql + +The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and +domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchSchema` +at runtime, and serves it with one generic resolver over any `SearchEngine`. It +names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, +`CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it +[`@lde/search-typesense`](../search-typesense) or another adapter). + +## Runtime configuration, not codegen + +`buildSearchSchema(schema, { typeName })` constructs the schema once at startup +from the field model — no SDL artifact, no generated resolver stubs. The field +model is the single source; the GraphQL contract is whatever it produces. 
Output +types, the `where`/`orderBy`/facet inputs, reference types and nullability are all +derived from each field’s `kind` and capability flags. + +```ts +import { buildSearchSchema } from '@lde/search-api-graphql'; + +const gqlSchema = buildSearchSchema(DATASET, { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + }), +}); + +// Hand `gqlSchema` to any graphql-js server; populate the per-request context: +// { engine: SearchEngine, acceptLanguage: string[] } +``` + +## What it builds + +- **Output type** (`typeName`) — localized text → best-first `[LanguageString!]!` + (`[0].language` is the language actually served); references → named per-shape + types (`Organization`, `Term`) with a `name`; scalars/booleans per kind; `date` + → ISO 8601 string; nullability from `required` / `array` / `kind`. +- **`where`** — one input per `filterable` field (`StringFilter`, `IntRange` / + `FloatRange` / `DateRange`, or `Boolean`). +- **`orderBy`** — `RELEVANCE` plus every `sortable` field, as an enum. +- **Facets** — an enum of every `facetable` field; a bucket carries `value` + + `count` + a nullable `label` — the resolved data label for **reference** facets, + `null` for token/free-string facets whose display the consumer owns (its own + i18n, or the value itself). + +## Why it can’t drift + +The surface reads the same field model the index is built from, and compiles into +the same neutral `SearchQuery` the engine consumes — so the API, the index and a +future REST surface stay in lockstep. The contract is **frozen** (breaking to +change), and because it is generated rather than hand-written, a _consumer_ guards +it with a `printSearchSchema(schema, options)` SDL snapshot over its **own** +schema and `typeName` — that snapshot also catches a `buildSearchSchema` change in +a future version of this library silently altering the consumer’s contract. 
diff --git a/packages/search-api-graphql/eslint.config.mjs b/packages/search-api-graphql/eslint.config.mjs new file mode 100644 index 00000000..2dcaf60c --- /dev/null +++ b/packages/search-api-graphql/eslint.config.mjs @@ -0,0 +1,22 @@ +import baseConfig from '../../eslint.config.mjs'; + +export default [ + ...baseConfig, + { + files: ['**/*.json'], + rules: { + '@nx/dependency-checks': [ + 'error', + { + ignoredFiles: [ + '{projectRoot}/eslint.config.{js,cjs,mjs}', + '{projectRoot}/vite.config.{js,ts,mjs,mts}', + ], + }, + ], + }, + languageOptions: { + parser: await import('jsonc-eslint-parser'), + }, + }, +]; diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json new file mode 100644 index 00000000..ea761b48 --- /dev/null +++ b/packages/search-api-graphql/package.json @@ -0,0 +1,32 @@ +{ + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchSchema at runtime (no codegen), served by one generic resolver over any SearchEngine. 
You supply the schema and typeName; it names neither your domain nor your engine.", + "repository": { + "url": "git+https://github.com/ldelements/lde.git", + "directory": "packages/search-api-graphql" + }, + "license": "MIT", + "type": "module", + "exports": { + "./package.json": "./package.json", + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js", + "development": "./src/index.ts", + "default": "./dist/index.js" + } + }, + "main": "./dist/index.js", + "module": "./dist/index.js", + "types": "./dist/index.d.ts", + "files": [ + "dist", + "!**/*.tsbuildinfo" + ], + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } +} diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts new file mode 100644 index 00000000..fdfccf09 --- /dev/null +++ b/packages/search-api-graphql/src/build-schema.ts @@ -0,0 +1,445 @@ +import { + GraphQLBoolean, + GraphQLEnumType, + GraphQLFloat, + GraphQLInputObjectType, + GraphQLInt, + GraphQLList, + GraphQLNonNull, + GraphQLObjectType, + GraphQLSchema, + GraphQLString, + printSchema, + type GraphQLEnumValueConfigMap, + type GraphQLFieldConfig, + type GraphQLInputFieldConfig, + type GraphQLInputType, + type GraphQLOutputType, +} from 'graphql'; +import { + facetableFields, + filterableFields, + filterOperatorFor, + outputFields, + sortableFields, + type Filter, + type LocalizedValue, + type Reference, + type SearchEngine, + type SearchField, + type SearchQuery, + type SearchSchema, + type Sort, +} from '@lde/search'; +import { + defaultLanguageOrder, + toLanguageStrings, + type LanguageOrder, +} from './language.js'; + +/** Populated per request by the transport; no framework type appears here. */ +export interface SearchContext { + readonly engine: SearchEngine; + /** Parsed, ordered `Accept-Language`; drives locale selection and output order. 
*/ + readonly acceptLanguage: readonly string[]; +} + +export interface BuildSearchSchemaOptions { + /** Drives all derived type names, e.g. `Dataset`. */ + readonly typeName: string; + /** Root query field; defaults to the lowercased plural of `typeName`. */ + readonly queryField?: string; + /** Consumer policy applied to every query (default status, sort, tie-breaks). */ + readonly queryDefaults?: ( + query: SearchQuery, + context: SearchContext, + ) => SearchQuery; + /** Output-language ordering; defaults to Accept-Language-first, `und` last. */ + readonly languageOrder?: LanguageOrder; +} + +type Source = Record; + +const nonNullListOf = (type: GraphQLOutputType): GraphQLOutputType => + new GraphQLNonNull(new GraphQLList(new GraphQLNonNull(type))); + +const scalarOutput = ( + scalar: GraphQLOutputType, + field: SearchField, +): GraphQLOutputType => + field.required === true ? new GraphQLNonNull(scalar) : scalar; + +/** SCREAMING_SNAKE_CASE for an enum value name, e.g. `datePosted` → `DATE_POSTED`. */ +function screamingSnake(name: string): string { + return name.replace(/([a-z0-9])([A-Z])/g, '$1_$2').toUpperCase(); +} + +/** + * Construct an executable GraphQL schema from the unified {@link SearchField} + * model at runtime — no codegen, no SDL artifact. One generic resolver maps the + * arguments to a {@link SearchQuery}, calls `context.engine`, and maps the result + * back; the field model only parameterises data. + */ +export function buildSearchSchema( + schema: SearchSchema, + options: BuildSearchSchemaOptions, +): GraphQLSchema { + const { typeName } = options; + const languageOrder = options.languageOrder ?? defaultLanguageOrder; + const queryField = + options.queryField ?? 
+ `${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; + + // --- Shared types --- + const languageString = new GraphQLObjectType({ + name: 'LanguageString', + fields: { + language: { type: GraphQLString }, + value: { type: new GraphQLNonNull(GraphQLString) }, + }, + }); + const facetBucket = new GraphQLObjectType({ + name: 'FacetBucket', + fields: { + value: { type: new GraphQLNonNull(GraphQLString) }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + // Nullable: the resolved data label for a reference facet, else null — + // the consumer owns display for token/free-string facets (its i18n or the + // value itself). + label: { + type: new GraphQLList(new GraphQLNonNull(languageString)), + resolve: (bucket: Source, _args: unknown, context: SearchContext) => { + const label = bucket.label as LocalizedValue | undefined; + return label + ? toLanguageStrings(label, context.acceptLanguage, languageOrder) + : null; + }, + }, + }, + }); + const sortDirection = new GraphQLEnumType({ + name: 'SortDirection', + values: { ASC: { value: 'asc' }, DESC: { value: 'desc' } }, + }); + const stringFilter = new GraphQLInputObjectType({ + name: 'StringFilter', + fields: { + in: { type: new GraphQLList(new GraphQLNonNull(GraphQLString)) }, + }, + }); + const intRange = rangeInput('IntRange', GraphQLInt); + const floatRange = rangeInput('FloatRange', GraphQLFloat); + const dateRange = rangeInput('DateRange', GraphQLString); + + const labelList = ( + resolveLabel: (source: Source) => LocalizedValue | undefined, + ) => ({ + type: nonNullListOf(languageString), + resolve: (source: Source, _args: unknown, context: SearchContext) => { + const value = resolveLabel(source); + return value + ? toLanguageStrings(value, context.acceptLanguage, languageOrder) + : []; + }, + }); + + // --- Reference types, one per referenced shape, reused by every field. 
--- + const referenceTypes = new Map(); + for (const field of outputFields(schema)) { + if ( + field.kind === 'reference' && + field.ref && + !referenceTypes.has(field.ref.type) + ) { + referenceTypes.set( + field.ref.type, + new GraphQLObjectType({ + name: field.ref.type, + fields: { + id: { + type: new GraphQLNonNull(GraphQLString), + resolve: (source: Source) => (source as unknown as Reference).id, + }, + name: labelList((source) => (source as unknown as Reference).label), + }, + }), + ); + } + } + + // --- Output type --- + const outputType = new GraphQLObjectType({ + name: typeName, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = { + id: { type: new GraphQLNonNull(GraphQLString) }, + }; + for (const field of outputFields(schema)) { + fields[field.name] = outputFieldConfig(field); + } + return fields; + }, + }); + + function outputFieldConfig( + field: SearchField, + ): GraphQLFieldConfig { + const passthrough = (source: Source) => source[field.name] ?? null; + switch (field.kind) { + case 'text': + return labelList( + (source) => source[field.name] as LocalizedValue | undefined, + ); + case 'keyword': + return field.array === true + ? { + type: nonNullListOf(GraphQLString), + resolve: (s) => s[field.name] ?? [], + } + : { type: scalarOutput(GraphQLString, field), resolve: passthrough }; + case 'reference': { + const referenceType = referenceTypes.get(field.ref?.type ?? '')!; + return field.array === true + ? { + type: nonNullListOf(referenceType), + resolve: (s) => s[field.name] ?? [], + } + : { + type: + field.required === true + ? new GraphQLNonNull(referenceType) + : referenceType, + resolve: passthrough, + }; + } + case 'integer': + return { type: scalarOutput(GraphQLInt, field), resolve: passthrough }; + case 'number': + return { + type: scalarOutput(GraphQLFloat, field), + resolve: passthrough, + }; + case 'date': + // Stored as Unix seconds (int64); the surface serves ISO 8601 (ADR 4). 
+ return { + type: scalarOutput(GraphQLString, field), + resolve: (source) => { + const value = source[field.name]; + return typeof value === 'number' + ? new Date(value * 1000).toISOString() + : (value ?? null); + }, + }; + case 'boolean': + return { + type: new GraphQLNonNull(GraphQLBoolean), + resolve: (source) => source[field.name] === true, + }; + } + } + + // --- where / orderBy / facets --- + const whereInput = new GraphQLInputObjectType({ + name: `${typeName}Where`, + fields: () => { + const fields: Record = {}; + for (const field of filterableFields(schema)) { + fields[field.name] = { type: whereFieldType(field) }; + } + return fields; + }, + }); + + function whereFieldType(field: SearchField): GraphQLInputType { + switch (filterOperatorFor(field.kind)) { + case 'in': + return stringFilter; + case 'range': + return field.kind === 'integer' + ? intRange + : field.kind === 'number' + ? floatRange + : dateRange; + default: + return GraphQLBoolean; + } + } + + const sortValues: GraphQLEnumValueConfigMap = { + RELEVANCE: { value: 'relevance' }, + }; + for (const field of sortableFields(schema)) { + sortValues[screamingSnake(field.name)] = { value: field.name }; + } + const sortField = new GraphQLEnumType({ + name: `${typeName}SortField`, + values: sortValues, + }); + const orderByInput = new GraphQLInputObjectType({ + name: `${typeName}OrderBy`, + fields: { + field: { type: new GraphQLNonNull(sortField) }, + direction: { + type: new GraphQLNonNull(sortDirection), + defaultValue: 'desc', + }, + }, + }); + + const facetValues: GraphQLEnumValueConfigMap = {}; + for (const field of facetableFields(schema)) { + facetValues[screamingSnake(field.name)] = { value: field.name }; + } + const facetField = new GraphQLEnumType({ + name: `${typeName}FacetField`, + values: facetValues, + }); + const facet = new GraphQLObjectType({ + name: 'Facet', + fields: { + field: { type: new GraphQLNonNull(facetField) }, + buckets: { type: nonNullListOf(facetBucket) }, + }, + }); + + 
const resultType = new GraphQLObjectType({ + name: `${typeName}SearchResult`, + fields: { + items: { type: nonNullListOf(outputType) }, + total: { type: new GraphQLNonNull(GraphQLInt) }, + page: { type: new GraphQLNonNull(GraphQLInt) }, + perPage: { type: new GraphQLNonNull(GraphQLInt) }, + facets: { type: nonNullListOf(facet) }, + }, + }); + + const query = new GraphQLObjectType({ + name: 'Query', + fields: { + [queryField]: { + type: new GraphQLNonNull(resultType), + args: { + query: { type: GraphQLString }, + where: { type: whereInput }, + orderBy: { type: orderByInput }, + page: { type: GraphQLInt, defaultValue: 1 }, + perPage: { type: GraphQLInt, defaultValue: 20 }, + facets: { type: new GraphQLList(new GraphQLNonNull(facetField)) }, + }, + resolve: async (_source, args, context: SearchContext) => { + const built = argsToQuery(args as QueryArgs, context, schema); + const finalQuery = options.queryDefaults + ? options.queryDefaults(built, context) + : built; + const result = await context.engine.search(finalQuery, schema); + return { + items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), + total: result.total, + page: Math.floor(finalQuery.offset / finalQuery.limit) + 1, + perPage: finalQuery.limit, + facets: Object.entries(result.facets).map(([field, buckets]) => ({ + field, + buckets, + })), + }; + }, + }, + }, + }); + + return new GraphQLSchema({ query }); +} + +/** + * The SDL of the built schema. Not a shipped artifact — a consumer uses it for an + * optional CI snapshot test over its own schema, catching accidental breaking + * changes to its frozen contract (including a `buildSearchSchema` change in a + * future version of this library silently altering it). 
+ */ +export function printSearchSchema( + schema: SearchSchema, + options: BuildSearchSchemaOptions, +): string { + return printSchema(buildSearchSchema(schema, options)); +} + +interface QueryArgs { + readonly query?: string; + readonly where?: Record; + readonly orderBy?: { field: string; direction: 'asc' | 'desc' }; + readonly page?: number; + readonly perPage?: number; + readonly facets?: readonly string[]; +} + +/** Pure args → {@link SearchQuery} mapping. */ +function argsToQuery( + args: QueryArgs, + context: SearchContext, + schema: SearchSchema, +): SearchQuery { + const perPage = args.perPage ?? 20; + const page = args.page ?? 1; + return { + text: args.query, + where: whereToFilters(args.where, schema), + orderBy: args.orderBy + ? [{ field: args.orderBy.field, direction: args.orderBy.direction }] + : [], + limit: perPage, + offset: (page - 1) * perPage, + facets: args.facets ?? [], + locale: context.acceptLanguage[0] ?? 'und', + }; +} + +function whereToFilters( + where: Record | undefined, + schema: SearchSchema, +): Filter[] { + if (where === undefined) { + return []; + } + const filters: Filter[] = []; + for (const field of filterableFields(schema)) { + const value = where[field.name]; + if (value === undefined || value === null) { + continue; + } + switch (filterOperatorFor(field.kind)) { + case 'in': + filters.push({ + field: field.name, + in: (value as { in?: string[] }).in ?? 
[], + }); + break; + case 'range': { + const range = value as { min?: number | string; max?: number | string }; + filters.push({ + field: field.name, + range: { min: range.min, max: range.max }, + }); + break; + } + default: + filters.push({ field: field.name, is: value as boolean }); + } + } + return filters; +} + +function rangeInput( + name: string, + bound: typeof GraphQLInt | typeof GraphQLFloat | typeof GraphQLString, +): GraphQLInputObjectType { + return new GraphQLInputObjectType({ + name, + fields: { min: { type: bound }, max: { type: bound } }, + }); +} + +// Re-exported for callers that compose a sort manually. +export type { Sort }; diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts new file mode 100644 index 00000000..2fe7db46 --- /dev/null +++ b/packages/search-api-graphql/src/index.ts @@ -0,0 +1,7 @@ +export { buildSearchSchema, printSearchSchema } from './build-schema.js'; +export type { + SearchContext, + BuildSearchSchemaOptions, +} from './build-schema.js'; +export { defaultLanguageOrder, toLanguageStrings } from './language.js'; +export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/src/language.ts b/packages/search-api-graphql/src/language.ts new file mode 100644 index 00000000..96826f65 --- /dev/null +++ b/packages/search-api-graphql/src/language.ts @@ -0,0 +1,47 @@ +import type { LocalizedValue } from '@lde/search'; + +/** One entry of the surface’s best-first `[LanguageString!]!`. `language` is null + * for untagged (`und`) values; `[0]` is the value to display and `[0].language` + * is the language actually served (the per-field `Content-Language`). */ +export interface LanguageString { + readonly language: string | null; + readonly value: string; +} + +/** Orders a localized value’s available languages against the request. 
*/ +export type LanguageOrder = ( + available: readonly string[], + accept: readonly string[], +) => readonly string[]; + +/** + * Default ordering: requested languages first (in request order), then the + * remaining tagged languages, then untagged (`und`) last — so `[0]` is always the + * best available value. + */ +export const defaultLanguageOrder: LanguageOrder = (available, accept) => { + const requested = accept.filter((language) => available.includes(language)); + const rest = available.filter( + (language) => language !== 'und' && !requested.includes(language), + ); + const untagged = available.includes('und') ? ['und'] : []; + return [...requested, ...rest, ...untagged]; +}; + +/** Flatten a language map into a best-first `LanguageString` list. */ +export function toLanguageStrings( + value: LocalizedValue, + accept: readonly string[], + order: LanguageOrder, +): LanguageString[] { + const result: LanguageString[] = []; + for (const language of order(Object.keys(value), accept)) { + for (const text of value[language] ?? []) { + result.push({ + language: language === 'und' ? null : language, + value: text, + }); + } + } + return result; +} diff --git a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap new file mode 100644 index 00000000..d1741f4e --- /dev/null +++ b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap @@ -0,0 +1,106 @@ +// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html + +exports[`GraphQL generator stability > emits a stable SDL for a representative schema 1`] = ` +"type Query { + things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20, facets: [ThingFacetField!]): ThingSearchResult! +} + +type ThingSearchResult { + items: [Thing!]! + total: Int! + page: Int! + perPage: Int! + facets: [Facet!]! +} + +type Thing { + id: String! 
+ title: [LanguageString!]! + description: [LanguageString!]! + keyword: [String!]! + creator: [Agent!]! + publisher: Agent + size: Int + score: Float + created: String + status: String! + open: Boolean! +} + +type LanguageString { + language: String + value: String! +} + +type Agent { + id: String! + name: [LanguageString!]! +} + +type Facet { + field: ThingFacetField! + buckets: [FacetBucket!]! +} + +enum ThingFacetField { + KEYWORD + CREATOR + PUBLISHER + STATUS + OPEN +} + +type FacetBucket { + value: String! + count: Int! + label: [LanguageString!] +} + +input ThingWhere { + keyword: StringFilter + creator: StringFilter + publisher: StringFilter + size: IntRange + score: FloatRange + created: DateRange + status: StringFilter + open: Boolean +} + +input StringFilter { + in: [String!] +} + +input IntRange { + min: Int + max: Int +} + +input FloatRange { + min: Float + max: Float +} + +input DateRange { + min: String + max: String +} + +input ThingOrderBy { + field: ThingSortField! + direction: SortDirection! 
= DESC +} + +enum ThingSortField { + RELEVANCE + TITLE + SIZE + CREATED +} + +enum SortDirection { + ASC + DESC +} +" +`; diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts new file mode 100644 index 00000000..b61ba240 --- /dev/null +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -0,0 +1,349 @@ +import { describe, expect, it } from 'vitest'; +import { graphql, printSchema } from 'graphql'; +import type { + SearchEngine, + SearchQuery, + SearchResult, + SearchSchema, +} from '@lde/search'; +import { buildSearchSchema, type SearchContext } from '../src/build-schema.js'; + +const schema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + output: true, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Organization', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + output: true, + }, + { name: 'datePosted', kind: 'date', sortable: true, output: true }, + { name: 'score', kind: 'number', output: true }, + { + name: 'terminologySource', + kind: 'reference', + array: true, + facetable: true, + output: true, + ref: { type: 'Term', strategy: 'labelOnly' }, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'iiif', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +/** A fake engine that records the query it received and returns a canned result. 
*/ +function fakeEngine(result: SearchResult): { + engine: SearchEngine; + received: () => SearchQuery; +} { + let captured: SearchQuery; + return { + engine: { + async search(query) { + captured = query; + return result; + }, + }, + received: () => captured, + }; +} + +const canned: SearchResult = { + total: 1, + hits: [ + { + id: 'https://d/1', + document: { + title: { nl: ['Titel'], en: ['Title'] }, + keyword: ['kaarten'], + publisher: { + id: 'https://org/1', + label: { nl: ['Het Utrechts Archief'] }, + }, + size: 1234, + datePosted: 1_700_000_000, + score: 4.5, + terminologySource: [ + { id: 'https://term/1', label: { nl: ['Kaarten'] } }, + ], + status: 'valid', + iiif: true, + }, + }, + ], + facets: { keyword: [{ value: 'kaarten', count: 3 }] }, +}; + +async function run( + source: string, + context: SearchContext, + variables?: Record, +) { + return graphql({ + schema: buildSearchSchema(schema, { typeName: 'Dataset' }), + source, + contextValue: context, + variableValues: variables, + }); +} + +describe('buildSearchSchema', () => { + it('resolves a query, mapping the result to the typed output', async () => { + const { engine, received } = fakeEngine(canned); + const result = await run( + `{ + datasets(query: "kaart") { + total + page + perPage + items { + id + title { language value } + keyword + publisher { id name { language value } } + terminologySource { id name { language value } } + size + datePosted + score + status + iiif + } + facets { field buckets { value count } } + } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect(data.page).toBe(1); + const item = (data.items as Record[])[0]; + expect(item.id).toBe('https://d/1'); + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: 'en', value: 'Title' }, + ]); + expect(item.keyword).toEqual(['kaarten']); + expect(item.publisher).toEqual({ + id: 
'https://org/1', + name: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }); + expect(item.size).toBe(1234); + expect(item.datePosted).toBe('2023-11-14T22:13:20.000Z'); + expect(item.score).toBe(4.5); + expect(item.terminologySource).toEqual([ + { id: 'https://term/1', name: [{ language: 'nl', value: 'Kaarten' }] }, + ]); + expect(item.iiif).toBe(true); + expect(data.facets).toEqual([ + { field: 'KEYWORD', buckets: [{ value: 'kaarten', count: 3 }] }, + ]); + // The free-text arg became the query text. + expect(received().text).toBe('kaart'); + }); + + it('orders the output list best-first for the requested language', async () => { + const { engine } = fakeEngine(canned); + const result = await run( + `{ datasets { items { title { language value } } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'en', value: 'Title' }, + { language: 'nl', value: 'Titel' }, + ]); + }); + + it('places untagged (und) values last with a null language', async () => { + const { engine } = fakeEngine({ + total: 1, + facets: {}, + hits: [ + { + id: 'x', + document: { title: { nl: ['Titel'], und: ['Naamloos'] } }, + }, + ], + }); + const result = await run( + `{ datasets { items { title { language value } } } }`, + { engine, acceptLanguage: ['en'] }, + ); + const item = ( + (result.data?.datasets as Record).items as Record< + string, + unknown + >[] + )[0]; + expect(item.title).toEqual([ + { language: 'nl', value: 'Titel' }, + { language: null, value: 'Naamloos' }, + ]); + }); + + it('labels reference-facet buckets, leaving plain-facet buckets null', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + publisher: [ + { + value: 'https://org/1', + count: 2, + label: { nl: ['Het Utrechts Archief'] }, + }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets 
{ facets { field buckets { value count label { language value } } } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { field: string; buckets: unknown[] }[]; + const publisher = facets.find((facet) => facet.field === 'PUBLISHER'); + const keyword = facets.find((facet) => facet.field === 'KEYWORD'); + expect(publisher?.buckets).toEqual([ + { + value: 'https://org/1', + count: 2, + label: [{ language: 'nl', value: 'Het Utrechts Archief' }], + }, + ]); + expect(keyword?.buckets).toEqual([ + { value: 'kaarten', count: 3, label: null }, + ]); + }); + + it('maps where, orderBy, facets and pagination into the SearchQuery', async () => { + const { engine, received } = fakeEngine(canned); + await run( + `{ + datasets( + where: { status: { in: ["valid"] }, keyword: {}, size: { min: 1, max: 9 }, iiif: true } + orderBy: { field: SIZE, direction: ASC } + page: 3 + perPage: 10 + facets: [KEYWORD, PUBLISHER] + ) { total } + }`, + { engine, acceptLanguage: ['nl'] }, + ); + + const query = received(); + expect(query.where).toContainEqual({ field: 'status', in: ['valid'] }); + // An empty StringFilter compiles to an empty membership. 
+ expect(query.where).toContainEqual({ field: 'keyword', in: [] }); + expect(query.where).toContainEqual({ + field: 'size', + range: { min: 1, max: 9 }, + }); + expect(query.where).toContainEqual({ field: 'iiif', is: true }); + expect(query.orderBy).toEqual([{ field: 'size', direction: 'asc' }]); + expect(query.facets).toEqual(['keyword', 'publisher']); + expect(query.limit).toBe(10); + expect(query.offset).toBe(20); + }); + + it('falls back to the und locale when no Accept-Language is given', async () => { + const { engine, received } = fakeEngine(canned); + await run(`{ datasets { total } }`, { engine, acceptLanguage: [] }); + expect(received().locale).toBe('und'); + }); + + it('applies queryDefaults before calling the engine', async () => { + let captured: SearchQuery | undefined; + const engine: SearchEngine = { + async search(query) { + captured = query; + return canned; + }, + }; + const gqlSchema = buildSearchSchema(schema, { + typeName: 'Dataset', + queryDefaults: (query) => ({ + ...query, + where: [...query.where, { field: 'status', in: ['valid'] }], + orderBy: [{ field: 'relevance', direction: 'desc' }], + }), + }); + await graphql({ + schema: gqlSchema, + source: `{ datasets { total } }`, + contextValue: { engine, acceptLanguage: ['nl'] }, + }); + expect(captured?.where).toEqual([{ field: 'status', in: ['valid'] }]); + expect(captured?.orderBy).toEqual([ + { field: 'relevance', direction: 'desc' }, + ]); + }); + + it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { + const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + expect(sdl).toMatch(/status: String!/); // required + expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable + expect(sdl).toMatch(/title: \[LanguageString!\]!/); + expect(sdl).toMatch(/keyword: \[String!\]!/); + expect(sdl).toMatch(/iiif: Boolean!/); + expect(sdl).toMatch(/publisher: Organization\b(?!!)/); // optional reference + }); + + 
it('builds the where, orderBy and facet enums from the field model', () => { + const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + expect(sdl).toMatch(/enum DatasetSortField/); + expect(sdl).toMatch(/RELEVANCE/); + expect(sdl).toMatch(/SIZE/); + expect(sdl).toMatch(/enum DatasetFacetField/); + expect(sdl).toMatch(/input DatasetWhere/); + expect(sdl).toMatch(/status: StringFilter/); + expect(sdl).toMatch(/size: IntRange/); + }); +}); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts new file mode 100644 index 00000000..78a86f40 --- /dev/null +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from 'vitest'; +import type { SearchSchema } from '@lde/search'; +import { printSearchSchema } from '../src/build-schema.js'; + +/** + * A neutral fixture exercising every kind + capability — NOT a real domain. Its + * SDL is snapshotted purely to pin the **generator**: any change to how + * `buildSearchSchema` maps the field model (nullability, type names, enums, + * reference reuse) surfaces as a snapshot diff before this library is published, + * so a consumer’s contract can’t shift from under it by accident. + */ +const THING: SearchSchema = { + type: 'https://example.org/Thing', + fields: [ + { + name: 'title', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 5 }, + sortable: true, + required: true, + }, + { + name: 'description', + kind: 'text', + localized: true, + locales: ['nl', 'en'], + output: true, + searchable: { weight: 2 }, + }, + { + name: 'keyword', + kind: 'keyword', + array: true, + facetable: true, + filterable: true, + searchable: { weight: 1 }, + output: true, + }, + // Two references sharing a shape → the Agent type is emitted once and reused. 
+ { + name: 'creator', + kind: 'reference', + array: true, + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'publisher', + kind: 'reference', + facetable: true, + filterable: true, + output: true, + ref: { type: 'Agent', strategy: 'labelOnly' }, + }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + output: true, + }, + { name: 'score', kind: 'number', filterable: true, output: true }, + { + name: 'created', + kind: 'date', + filterable: true, + sortable: true, + output: true, + }, + { + name: 'status', + kind: 'keyword', + facetable: true, + filterable: true, + required: true, + output: true, + }, + { + name: 'open', + kind: 'boolean', + facetable: true, + filterable: true, + output: true, + }, + ], +}; + +describe('GraphQL generator stability', () => { + it('emits a stable SDL for a representative schema', () => { + expect(printSearchSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); + }); +}); diff --git a/packages/search-api-graphql/tsconfig.json b/packages/search-api-graphql/tsconfig.json new file mode 100644 index 00000000..62ebbd94 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "../../tsconfig.base.json", + "files": [], + "include": [], + "references": [ + { + "path": "./tsconfig.lib.json" + }, + { + "path": "./tsconfig.spec.json" + } + ] +} diff --git a/packages/search-api-graphql/tsconfig.lib.json b/packages/search-api-graphql/tsconfig.lib.json new file mode 100644 index 00000000..64610bac --- /dev/null +++ b/packages/search-api-graphql/tsconfig.lib.json @@ -0,0 +1,26 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist", + "tsBuildInfoFile": "dist/tsconfig.lib.tsbuildinfo", + "emitDeclarationOnly": false, + "types": ["node"] + }, + "include": ["src/**/*.ts"], + "references": [{ "path": "../search/tsconfig.lib.json" }], + "exclude": [ + 
"vite.config.ts", + "vite.config.mts", + "vitest.config.ts", + "vitest.config.mts", + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx" + ] +} diff --git a/packages/search-api-graphql/tsconfig.spec.json b/packages/search-api-graphql/tsconfig.spec.json new file mode 100644 index 00000000..04480f69 --- /dev/null +++ b/packages/search-api-graphql/tsconfig.spec.json @@ -0,0 +1,29 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "outDir": "./out-tsc/vitest", + "types": [ + "vitest/globals", + "vitest/importMeta", + "vite/client", + "node", + "vitest" + ] + }, + "include": [ + "test/**/*.test.ts", + "test/**/*.spec.ts", + "test/**/*.test.tsx", + "test/**/*.spec.tsx", + "test/**/*.test.js", + "test/**/*.spec.js", + "test/**/*.test.jsx", + "test/**/*.spec.jsx", + "test/**/*.d.ts" + ], + "references": [ + { + "path": "./tsconfig.lib.json" + } + ] +} diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts new file mode 100644 index 00000000..725cf854 --- /dev/null +++ b/packages/search-api-graphql/vite.config.ts @@ -0,0 +1,21 @@ +/// +import { defineConfig, mergeConfig } from 'vite'; +import baseConfig from '../../vite.base.config.js'; + +export default mergeConfig( + baseConfig, + defineConfig({ + root: __dirname, + cacheDir: '../../node_modules/.vite/packages/search-api-graphql', + test: { + coverage: { + thresholds: { + functions: 90, + lines: 90, + branches: 78, + statements: 90, + }, + }, + }, + }), +); diff --git a/tsconfig.json b/tsconfig.json index 0b6d2b2c..0defc069 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -76,6 +76,9 @@ }, { "path": "./packages/search" + }, + { + "path": "./packages/search-api-graphql" } ] } From df128c46551ec8ee423da97f16a3181917109c94 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 29 Jun 2026 09:38:15 +0200 Subject: [PATCH 04/13] 
docs(search): reconcile ADRs 0003 and 0004 with the NDE stack docs - state the decisions directly as the reconciled architecture, not deviations from a draft - remove the deviation/reconcile framing and the deviations-to-reconcile lists - align wording with the stack platform layer --- .../0003-search-api-core-query-model.md | 44 +++++++------------ .../0004-search-api-graphql-surface.md | 28 +++++------- 2 files changed, 27 insertions(+), 45 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 57521fad..c093a6f8 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -6,10 +6,9 @@ Date: 2026-06-25 Proposed -Reconciles against the NDE stack platform docs -(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`), which are themselves -a **draft under discussion**, so several decisions below are deliberate deviations from -the current draft, to be reconciled back into it. +Aligned with the NDE stack platform docs +(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`); the decisions below are +reflected there. ## Context @@ -19,10 +18,9 @@ declarative source so the GraphQL surface, a later REST surface, and the index c from each other, and so a deployment can swap search engines without consumers noticing. That requires an engine- and protocol-neutral **core** that both API surfaces and any -engine adapter sit on. The platform draft frames this as Ports & Adapters with a framed -JSON-LD intermediate representation, generated from SHACL + a `search:` annotation -vocabulary. We adopt that direction but scope it to what a v1 keyword search needs, and -diverge on a few concrete points where the draft does not fit DR’s catalog-search case. +engine adapter sit on. 
The architecture is Ports & Adapters with a framed JSON-LD +intermediate representation, generated from SHACL + a `search:` annotation vocabulary, +scoped here to what a v1 keyword search needs. ## Decision @@ -37,9 +35,6 @@ Two tiers: `search-*` is backend you compose; `search-api-*` is the surface you | API surface | `@lde/search-api-graphql` | field model + `SearchQuery` → GraphQL schema (runtime configuration; see [ADR 4](./0004-search-api-graphql-surface.md)) | | API surface | `@lde/search-api-rest` | OpenAPI + route handlers (later, thin over the core) | -This deviates from the draft’s function-mapping table (`@lde/graphql-server`, -`@lde/rest-server`, no core row); the draft should adopt the `@lde/search*` family. - ### Contract frozen, storage swappable The **API contract** (the SDL shape consumers couple to) is breaking to change and must be @@ -254,7 +249,7 @@ per-shape types (e.g. `Organization`, `Term`) with `label` exposed as `name` - **IR / adapter-return:** JSON-LD language map (`@container: @language`), `@set` arrays, `und` for untagged. Matches schema-profile #171 (language maps are more usable as a data - model) and the platform draft’s envelope. + model) and the stack platform envelope. - **GraphQL surface:** a single **best-first** `Accept-Language`-ordered list (`[LanguageString!]!`, see [ADR 4](./0004-search-api-graphql-surface.md)). `[0]` is the value to display; **`[0].language` is the language actually served** – the per-field @@ -270,7 +265,7 @@ argument (deferred): a parallel arg would duplicate the header and need preceden Chosen over a `{nl,en}` map (silently yields `undefined` for a missing language, no defined fallback order) and over a separate resolved scalar (the value must be a `LanguageString` to carry its language anyway, so the scalar saved only the `[0]` index – not worth a second -field plus a deviation from the draft / Network-of-Terms list shape). 
Grounded in measured +field plus diverging from the Network-of-Terms list shape). Grounded in measured data and all three substrates: - **A (descriptions, measured):** bilingual `nl`/`en`, ~86% Dutch-only → an English user gets @@ -284,31 +279,26 @@ have an English title) is distinct from content `dct:language` (already filterab preference; expressible as a facetable dimension (languages-present-in-a-localized-field), not enabled for DR v1, more relevant for B/C. -### Other reconciled decisions +### Other decisions - **Numbered pagination** (`offset`/`limit`, presented as page/per-page), not Relay cursors. DR is a page-numbered faceted browser with totals; Typesense is natively page/per-page; the ~2,500-doc corpus never paginates deep enough for offset cost to bite; and the blue/green alias swap removes the mutation-drift that motivates cursors. - **Sidecar canonical labels**, not inline `labelOnly` as default. Facets need one - canonical label per entity; the draft’s own two-source model puts canonical labels in a - separate collection, which is what DR’s `labels` collection is. `nestedStrategy` is - carried as metadata but inline `labelOnly` is not the default. -- **Logical typed result document** at the query seam; framed JSON-LD kept index-side. The - draft treats framed JSON-LD as the universal IR; we scope it to the index/projection - artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not catalog-search’s), - gated on the generic framing packages existing rather than on DR. + canonical label per entity, kept in a separate collection — DR’s `labels` collection. A + reference’s `strategy` is carried as metadata; `labelOnly` is the v1 default, not inline. +- **Logical typed result document** at the query seam; framed JSON-LD kept index-side as the + index/projection artifact (its payoff – vector/LDES/UI sinks – is object-search’s, not + catalog-search’s), gated on the generic framing packages existing rather than on DR. 
## Consequences - One declarative source drives GraphQL, later REST, and the index; they cannot drift. - The engine is a swappable adapter; the contract outlives engine choices. -- Adopted from the draft unchanged: the Stable API Contract discipline, `nestedStrategy` as - a concept, the surface `LanguageString` list, folding at the adapter boundary + query - side via `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. -- Deviations to reconcile into the platform draft: numbered pagination; sidecar labels; - logical result doc (framed JSON-LD scoped to index-side); `min`/`max` filter ranges; the - `@lde/search*` naming and a core package row. +- Carried through: the Stable API Contract discipline, the reference `strategy` concept, the + surface `LanguageString` list, folding at the adapter boundary + query side via + `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. - Adopted during implementation (2026-06-26): the **unified** field model – the projection `FieldSpec` (RDF→doc) and the deployment’s Typesense `SEARCH_FIELDS` are folded into this one `SearchField` (see the Field model note above). diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index 54c34000..f16c066b 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -11,8 +11,8 @@ Builds on [ADR 3 (Search API core query model)](./0003-search-api-core-query-mod ## Context Given the engine-neutral core of [ADR 3](./0003-search-api-core-query-model.md), the first -API surface is GraphQL. The platform draft requires the surface to be derived from the same -source as the index, never hand-written, so it cannot drift. It must also be framework-free: +API surface is GraphQL. The surface is derived from the same source as the index, never +hand-written, so it cannot drift. 
It must also be framework-free: resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any GraphQL server can host the schema (DR mounts it inline; a Fastify wrapper is deferred and, if ever built, is a separate package). @@ -21,12 +21,11 @@ is a separate package). ### Runtime configuration, not code generation -The platform draft frames this as _generating_ the surface – emitting GraphQL SDL **and** -resolvers as artifacts. We deviate: nothing is emitted or committed. The schema is -**constructed at runtime from the field-model configuration** (`buildSearchSchema(config)`), -once at startup, and the resolvers are **generic functions inside the package** attached to -that schema. A better name for the draft’s “generation” step, at least for this surface, is -**runtime configuration**. +The surface is **constructed at runtime from the field-model configuration** +(`buildSearchSchema(config)`), once at startup, with the resolvers as **generic functions +inside the package** attached to that schema. Nothing is emitted or committed — there is no +generated GraphQL SDL or resolver artifact. The accurate name for this step is **runtime +configuration**, not generation. This matters because the resolvers are inherently generic – there is essentially one root resolver that maps args to a `SearchQuery`, calls the engine, and maps the result back; @@ -38,10 +37,6 @@ need no committed `.graphql` file. The field-model diff is the reviewable change `printSchema()` helper exists only as an **optional** CI snapshot test for catching accidental breaking changes to the frozen contract – not a shipped artifact. -> Deviation from the stack draft: the draft’s “generate SDL + resolvers” becomes -> _construct the schema at runtime from configuration; resolvers are generic and in-package; -> SDL is served live via introspection, not emitted._ For the reconciliation list. 
- ### The schema-building function ```ts @@ -345,12 +340,9 @@ Each transport populates it per request; no framework type appears in the packag facet types. Breaking to change – right in v1. - **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes facets, the `SearchDocument` shape. -- **Deviations to reconcile into the platform draft:** - - “generate SDL + resolvers” → _runtime configuration_ (construct at startup from config; - generic in-package resolvers; SDL served via introspection, not emitted as an artifact). - - Named reference types per shape (`Organization`, `Term`) rather than the draft’s uniform - `labelOnly` `{ @id, @type, name }` reference shape – chosen for ergonomics and - additive `inline` growth. +- **Named reference types** per shape (`Organization`, `Term`) rather than a single uniform + reference type – chosen for ergonomics and additive `inline` growth (`labelOnly` → `inline` + only adds fields, non-breaking). - Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor pagination; a `Date` scalar (kept ISO `String`) and a `Long`/`BigInt` scalar for 64-bit From c9c28232c679b1f11a69d944fe5ddf4f7ac5cd3b Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 29 Jun 2026 10:30:08 +0200 Subject: [PATCH 05/13] feat(search): project number-kind fields - number fields now project as floats (not truncated like integer) - closes the step-1 gap so an int64-magnitude field mapped to number (Float) indexes --- packages/search/src/project.ts | 14 ++++++++++++-- packages/search/test/project.test.ts | 11 +++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index 284c3183..71e2416e 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -78,6 +78,12 @@ function applyField( field.name, 
toInteger(firstLiteralOf(node, path)), ); + case 'number': + return setNumber( + document, + field.name, + toNumber(firstLiteralOf(node, path)), + ); case 'date': return setNumber( document, @@ -85,8 +91,8 @@ function applyField( isoToUnix(firstLiteralOf(node, path)), ); } - // `number` and `boolean` are not projected from a path in current schemas - // (booleans are derivation-populated, e.g. the compatibility vinkjes). + // `boolean` is not projected from a path in current schemas — booleans are + // derivation-populated (e.g. the compatibility vinkjes). } /** @@ -236,6 +242,10 @@ function toInteger(literal: string | undefined): number | undefined { return literal === undefined ? undefined : Math.trunc(Number(literal)); } +function toNumber(literal: string | undefined): number | undefined { + return literal === undefined ? undefined : Number(literal); +} + function isoToUnix(iso: string | undefined): number | undefined { if (iso === undefined) { return undefined; diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 8f513baa..592caac6 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -146,6 +146,17 @@ describe('projectDocument', () => { expect(document.class).toEqual(['http://example.org/BareClass']); }); + it('projects a number field as a float (not truncated like integer)', () => { + const document = projectDocument( + { '@id': 'https://ex/d/12', [`${DR}size`]: { '@value': '1234.5' } }, + { + type: DATASET, + fields: [{ name: 'size', path: `${DR}size`, kind: 'number' }], + }, + ); + expect(document.size).toBe(1234.5); + }); + it('folds the transformed values (not the raw ones) for a facet search field', () => { const document = projectDocument( { '@id': 'https://ex/d/4', [`${DR}format`]: [`${IANA}text/turtle`] }, From 0c1b0865a415752dc13716a7182d5fa455c8e72f Mon Sep 17 00:00:00 2001 From: David de Boer Date: Mon, 29 Jun 2026 12:11:43 +0200 Subject: [PATCH 06/13] docs(search): 
link ADR 3 to the published stack platform docs Replace the repo-path breadcrumb with a direct link to the docs site, so the status note points readers at the rendered page rather than a source file path. --- docs/decisions/0003-search-api-core-query-model.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index c093a6f8..e931d849 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -6,9 +6,8 @@ Date: 2026-06-25 Proposed -Aligned with the NDE stack platform docs -(`netwerk-digitaal-erfgoed/docs` → `docs/stack/layers/platform.md`); the decisions below are -reflected there. +Aligned with the NDE [stack platform docs](https://docs.nde.nl/stack/layers/platform); the +decisions below are reflected there. ## Context From 42e8d11074d89491c2d5672021bf0749947c8e68 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 09:47:38 +0200 Subject: [PATCH 07/13] feat(search)!: keyed facet surface, range facets, label cache; remove the group companion - Keyed per-type facets object on the GraphQL surface (ValueBucket / RangeBucket), selection-is-the-request with skip-own-filter. - Numeric range facets and an opt-in label cache in the Typesense adapter. - Reconcile ADRs 0003 and 0004 with the implementation. BREAKING CHANGE: remove SearchField.group and its *_group companion field, collection column and query split. Deployments denormalize group tokens into the field values instead, so a group is an ordinary facet value with no engine mechanism. 
--- .../0003-search-api-core-query-model.md | 36 +- .../0004-search-api-graphql-surface.md | 256 +++++++------- .../search-api-graphql/src/build-schema.ts | 125 +++++-- .../generator-stability.test.ts.snap | 23 +- .../test/build-schema.test.ts | 182 +++++++++- packages/search-api-graphql/vite.config.ts | 8 +- .../search-typesense/src/collection-schema.ts | 8 - .../search-typesense/src/query-compiler.ts | 76 ++-- packages/search-typesense/src/search.ts | 212 ++++++++++-- .../generator-stability.test.ts.snap | 6 - .../test/collection-schema.test.ts | 10 - .../test/generator-stability.test.ts | 5 +- .../test/parse-response.test.ts | 326 +++++++++++++++++- .../test/query-compiler.test.ts | 53 ++- packages/search-typesense/vite.config.ts | 8 +- packages/search/src/engine.ts | 13 +- packages/search/src/index.ts | 1 + packages/search/src/schema.ts | 33 +- packages/search/test/schema.test.ts | 21 -- packages/search/vite.config.ts | 6 +- 20 files changed, 1071 insertions(+), 337 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index e931d849..38f9e697 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -72,6 +72,7 @@ interface SearchField { readonly kind: FieldKind; readonly path?: string; // sh:path to project from; omit for a derivation-populated field readonly array?: boolean; // sh:maxCount + readonly required?: boolean; // sh:minCount ≥ 1 — non-null in output, non-optional in the index readonly localized?: boolean; // rdf:langString / sh:languageIn (text only) readonly locales?: readonly string[]; // when localized: which languages to emit readonly output?: boolean; // appears in the schema output type @@ -81,7 +82,7 @@ interface SearchField { readonly sortable?: boolean; readonly ref?: { type: string; strategy: 'labelOnly' | 'idOnly' | 'inline' }; // kind: 'reference' readonly transform?: (value: string) => string; // 
projection-time value transform - readonly group?: { readonly name: string; readonly prefix: string }; // deployment delta + readonly facetRanges?: readonly FacetRange[]; // numeric facet: fixed [min, max) range bins (histogram) vs per-value buckets } type Derivation = (document: SearchDocument, node: FramedNode) => void; @@ -89,7 +90,7 @@ type Derivation = (document: SearchDocument, node: FramedNode) => void; interface SearchSchema { readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; - readonly derivations?: readonly Derivation[]; // computed fields: status, *_group, booleans + readonly derivations?: readonly Derivation[]; // computed fields: status, booleans } ``` @@ -100,10 +101,11 @@ eventual generator emits it unchanged. A field with **no `path`** is a derived f populated by a `Derivation` rather than projected from the IR – yet it still carries full query/schema/output behavior, which is how the former separate projection `FieldSpec` is subsumed. The physical field names a declaration fans out to (`${name}_search_${locale}`, -`${name}_sort_${locale}`, `${name}_search`, `${name}_group`) follow one convention owned by -`@lde/search`, so projection, collection schema and query compiler agree. The `group` -companion (coarse grouped facets, e.g. `format_group`) and the `status_rank` tie-break sort -are **deployment-specific deltas**, never in `@lde/search`. `relevance` is _not_ a delta: +`${name}_sort_${locale}`, `${name}_search`) follow one convention owned by +`@lde/search`, so projection, collection schema and query compiler agree. The `status_rank` +tie-break sort is a **deployment-specific delta**, never in `@lde/search`. Grouped facets need +no field-model mechanism at all: a deployment derivation materializes group tokens (e.g. +`group:rdf`) into the field’s own values – see Consequences. `relevance` is _not_ a delta: every full-text engine ranks by match score, so it is a generic reserved sort the adapter understands. 
@@ -164,8 +166,26 @@ variable-based clients (`$o: DatasetOrderBy`) break, so a future array is a deli **Inclusive bounds only** – `min`/`max`, no `gt`/`gte`/`lt`/`lte`: self-documenting, matches Typesense’s native inclusive range, covers every DR case, additively reversible. -Grouped facets need no special shape – `group:`-prefixed tokens travel as ordinary `in` -strings and the adapter splits/unions them. +A numeric facet returns **range buckets** (`[min, max)` bins declared per field); the adapter +maps them to the engine’s native range faceting. + +**Grouped facets need no special engine mechanism; they are denormalized at index time.** +A coarse category alongside granular values (e.g. `group:rdf` next to media types, `group:person` +next to class IRIs) is materialized into the field’s own values during projection, so at query +time a group token is an ordinary value: faceted natively, filtered by plain membership +(`field.in: ["group:rdf"]` unions with granular values for free), and — where the field is +`output` – read like any other value. There is no `_group` companion, no `group:`-prefix split, +no filter rewriting in the adapter; the engine stays dumb and denormalization (the document +store’s strength) does the work. A cross-source signal that is not a subset of the field (e.g. a +SPARQL capability derived from `conformsTo`, not a media type) is likewise materialized as a plain +value by a deployment derivation. + +The trade-off this design accepts: **group membership is fixed at index time.** Because the +group token is baked into each document’s values during projection, redefining a group (which +granular values map to `group:rdf`) is an index-data change that takes effect only on **reindex** – +there is no query-time mapping to edit. 
The constraint is acceptable here because group definitions +are deployment projection config that already drives indexing, and reindexing is already the +pipeline’s job; it would not suit a system where grouping is user-defined or changes frequently. ### Engine port and result diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index f16c066b..c5b297da 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -11,31 +11,26 @@ Builds on [ADR 3 (Search API core query model)](./0003-search-api-core-query-mod ## Context Given the engine-neutral core of [ADR 3](./0003-search-api-core-query-model.md), the first -API surface is GraphQL. The surface is derived from the same source as the index, never -hand-written, so it cannot drift. It must also be framework-free: -resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any GraphQL server -can host the schema (DR mounts it inline; a Fastify wrapper is deferred and, if ever built, -is a separate package). +API surface is GraphQL, derived from the same source as the index so it cannot drift. It must +be framework-free: resolvers are standard `graphql-js`, not tied to Fastify/Mercurius, so any +GraphQL server can host the schema (DR mounts it inline; a Fastify wrapper is a deferred +separate package). ## Decision ### Runtime configuration, not code generation The surface is **constructed at runtime from the field-model configuration** -(`buildSearchSchema(config)`), once at startup, with the resolvers as **generic functions -inside the package** attached to that schema. Nothing is emitted or committed — there is no -generated GraphQL SDL or resolver artifact. The accurate name for this step is **runtime -configuration**, not generation. 
- -This matters because the resolvers are inherently generic – there is essentially one root -resolver that maps args to a `SearchQuery`, calls the engine, and maps the result back; -the field model only parameterises data. Codegen would emit N near-identical resolver stubs +(`buildSearchSchema(config)`), once at startup, with generic resolvers shipped in the package +attached to that schema – nothing is emitted or committed. The resolvers are inherently +generic (one root resolver maps args to a `SearchQuery`, calls the engine, and maps the result +back; the field model only parameterises data), so codegen would emit N near-identical stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. -**No SDL artifact.** A live GraphQL API serves its own schema via introspection, so clients -need no committed `.graphql` file. The field-model diff is the reviewable change. A -`printSchema()` helper exists only as an **optional** CI snapshot test for catching -accidental breaking changes to the frozen contract – not a shipped artifact. +A live GraphQL API serves its own schema via introspection, so clients need no committed +`.graphql` file; the field-model diff is the reviewable change. `printSearchSchema()` exists +only as an **optional** CI snapshot test guarding the frozen contract against accidental +breaking changes – not a shipped artifact. ### The schema-building function @@ -80,21 +75,17 @@ composes the exported typeDefs/resolvers by hand. 
### A typed surface the contract does not depend on -Because `buildSearchSchema` is generic over the config _value_ (``), one -`as const satisfies SearchSchema` declaration drives two **independent** projections: +One `as const satisfies SearchSchema` declaration drives two **independent** projections: the +**runtime contract** (the `GraphQLSchema`, built at startup by reading the value – +`field.kind`, `output`, `facetable`, …) and a **static TS mirror** (`OutputOf` / +`WhereOf` / `OrderByOf` / `FacetOf`, computed from `typeof schema` via mapped types). -- **the runtime contract** – the `GraphQLSchema`, built at startup by reading the value - (`field.kind`, `output`, `facetable`, …); and -- **a static TS mirror** – `OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf`, - computed from `typeof schema` via mapped types. - -The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time -only and TS types are erased, so the served schema is byte-identical whether or not the -mirror types exist – they are a developer-experience overlay, never the source. The two are -parallel derivations of one value: the runtime kind→GraphQL-type mapping lives in -`buildSearchSchema`; the type-level mapping in `OutputOf` duplicates it. They can drift, -so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot test (the -real artifact), while the TS mirror only catches our own coding mistakes against it. +The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time only +and erased, so the served schema is byte-identical whether or not the mirror exists – it is a +developer-experience overlay. 
The two derivations can drift (the runtime kind→GraphQL-type +mapping lives in `buildSearchSchema`; the type-level mapping in `OutputOf` duplicates it), +so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot (the real +artifact), while the TS mirror only catches our own coding mistakes against it. Values are typed at both ends, with the resolver as the typed transform between them: @@ -103,26 +94,27 @@ Values are typed at both ends, with the resolver as the typed transform between | IR (`ResultDocument`) | `LocalizedValue` (lang map) | `Reference` | `number` | `readonly string[]` | `boolean` | | GraphQL (`OutputOf`) | `LanguageString[]` (best-first list) | named type (`Organization`) | `Float`/`number` | `[String!]!`/`string[]` | `Boolean!`/`boolean` | -What stays unchecked is only the **generic resolver’s dynamic middle**: it loops over the +What stays unchecked is only the generic resolver’s **dynamic middle**: it loops over the field model with runtime-string names, so TS cannot prove the object it builds matches `OutputOf` – it casts at that boundary, and graphql-js’s executor (not TS) enforces the -output types at runtime (a wrong-typed return raises a field error). This is the same -“typed boundaries, dynamic middle” shape as the engine port and the projection: type the -edges where it is honest, accept a cast where iteration is inherently dynamic. +output types at runtime (a wrong-typed return raises a field error). Same “typed boundaries, +dynamic middle” shape as the engine port and the projection: type the edges where it is +honest, accept a cast where iteration is inherently dynamic. ### Construction rules (field model → schema) -Type names derive from `typeName`; shared types (`LanguageString`, `Facet`, `FacetBucket`, -`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once. 
+Type names derive from `typeName`; shared types (`LanguageString`, `ValueBucket`, `RangeBucket`, +`SortDirection`, `StringFilter`, `IntRange`, `FloatRange`, `DateRange`) are emitted once, and the +per-type keyed facets object is named `<typeName>Facets`. GraphQL field names are the field model `name` verbatim (declare camelCase). - **Output type** – one field per `output` field: `text`+`localized` → `[LanguageString!]!` (best-first; `[0].language` = served language, the per-field `Content-Language`); `keyword` array → `[String!]!`, scalar → `String`; `integer` → `Int` (signed 32-bit); `number` → `Float` (exact integers to 2^53); `date` → `String` (ISO 8601); `boolean` → `Boolean!` (absent = false); `reference` → see below. Nullability from `array` / required / - optional; `id` is `String!`. A field whose magnitude can exceed 32 bits (a 64-bit count or - byte size – e.g. DR’s `size`) is modelled as `number` → `Float`, since GraphQL’s `Int` - would overflow; a `Long`/`BigInt` custom scalar is the deferred alternative. + optional; `id` is `String!`. A magnitude that can exceed 32 bits (a 64-bit count or byte size + – e.g. DR’s `size`) is `number` → `Float`, since `Int` would overflow; a `Long`/`BigInt` + custom scalar is the deferred alternative. - **Reference types** – a `reference` field is typed by the **referenced shape** (`sh:class`/`sh:node`), emitted once and reused by every field referencing the same shape. Its fields follow `nestedStrategy`: @@ -134,9 +126,9 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). | `inline` (later) | the named type plus the referenced shape’s projected fields | So DR emits `publisher: Organization` (the `foaf:Agent` shape) and - `terminologySource: [Term!]!`; a shape’s type is emitted once and reused by any field that - references it. Named, not a generic GraphQL `Reference`: going `labelOnly → inline` then - only _adds_ fields (non-breaking), whereas generic→named later would break the contract.
+ `terminologySource: [Term!]!`. Named, not a generic GraphQL `Reference`: going + `labelOnly → inline` then only _adds_ fields (non-breaking), whereas generic→named later + would break the contract. - **`where` input** – one field per `filterable` field: `keyword`/`reference` → `StringFilter { in: [String!] }`; `integer` → `IntRange { min, max }`; `number` → @@ -144,22 +136,32 @@ GraphQL field names are the field model `name` verbatim (declare camelCase). `is` value); `text` is excluded (it goes through the `query` arg). - **`orderBy`** – `RELEVANCE` (the sane default when a `query` is present) plus every `sortable` field, as an enum, in a single `{ field, direction }` input. Only - publicly-selectable sorts appear here; the resolver expands the client’s one choice into - the internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via - `queryDefaults` (never exposed). Single for now because a user picks one dimension. - Promoting it to a list later is backward-compatible only for inline-literal clients (list - input coercion wraps a single value); **variable-based clients break** (`$o: DatasetOrderBy` - is rejected where `[DatasetOrderBy!]` is expected), so a future array is a deliberate, - potentially breaking change – not a free one. -- **Facets** – an enum of every `facetable` field; requested per query, returned with counts. - A bucket’s `value` is its selection key; `label` is the **nullable** display label. - The engine resolves `label` only for **reference** facets — IRI-keyed buckets whose - canonical multilingual label is _data_, fetched from the sidecar `labels` collection in the - same lookup as hit references. It is `null` for token facets (e.g. `status`) and - free-string facets (e.g. `keyword`): those carry no data label, and the consumer owns their - display — its own i18n catalog for controlled tokens (`valid` → “Geldig”/“Valid”, which the - engine cannot and must not fabricate), or the `value` itself for free strings. 
The null is - load-bearing: it tells a client whether a server-resolved label exists or display is theirs. + publicly-selectable sorts appear; the resolver expands the client’s one choice into the + internal `Sort[]`, appending deployment tie-breaks like DR’s `status_rank` via + `queryDefaults` (never exposed). Single for now because a user picks one dimension; promoting + it to a list later is backward-compatible only for inline-literal clients (list input + coercion) – **variable-based clients break** (`$o: DatasetOrderBy` where `[DatasetOrderBy!]` + is expected) – so a future array is a deliberate, potentially breaking change. +- **Facets** – a **keyed object** (`<TypeName>Facets`, e.g. `DatasetFacets`), one field per `facetable` field, typed by + the field’s kind: a numeric range-facet field is `[RangeBucket!]!`, every other facet is + `[ValueBucket!]!`. The facet set and each bucket shape are thus encoded **statically in the + schema**, not discovered at runtime through an enum + polymorphic bucket (no `__typename`, no + fragments). **Selection is the request**: only the facet keys a query selects are computed + (the resolver inspects the selection), each with its **own where-filter removed** + (skip-own-filter – a multi-select facet still lists its other options; dropping a `status` + filter also drops the valid-only default, so the status facet counts across every status). + Two bucket types: + - `ValueBucket { value, count, label }` – `value` is the selection key (filter via + `field.in`); `label` (nullable) is the engine-resolved canonical **data** label, present + only for **reference** (IRI-keyed) facets, `null` for token/free-string facets whose + display the consumer owns (its i18n for controlled tokens like `valid` → “Geldig”/“Valid”, + or the `value` itself). The null is load-bearing. + - `RangeBucket { min, max, count }` – a half-open `[min, max)` numeric bin (`max` null on an + open-ended top bin), filtered via `field.range`. 
+ - A grouped facet (a coarse category alongside granular values, e.g. `group:rdf` next to media + types) needs **no special bucket**: its tokens are denormalized into the field at index time, + so they are ordinary `ValueBucket` values – faceted, filtered (`field.in: ["group:rdf"]`) and, + where output, read like any other value (see ADR 0003). ### Resulting schema (DR example, abridged) @@ -184,29 +186,15 @@ type Dataset { publisher: Organization terminologySource: [Term!]! format: [String!]! - class: [String!]! - size: Float # int64 magnitude → Float, not Int (32-bit); see note below + size: Float # int64 magnitude → Float, not Int (32-bit) datePosted: String status: String iiif: Boolean! # … keyword, language, iiifManifestCount, ndeSchemaAp, linkedData, terms, persistentUris } -input StringFilter { - in: [String!] -} -input IntRange { - min: Int - max: Int -} -input FloatRange { - min: Float - max: Float -} -input DateRange { - min: String - max: String -} +# shared inputs are emitted once and reused: DR uses StringFilter + FloatRange + +# SortDirection (IntRange / DateRange are pruned – no filterable int/date field). input DatasetWhere { publisher: StringFilter @@ -214,9 +202,7 @@ input DatasetWhere { class: StringFilter status: StringFilter size: FloatRange - datePosted: DateRange - iiif: Boolean - # … keyword, language, terminologySource, catalog, ndeSchemaAp, linkedData, terms, persistentUris + # … keyword, language, terminologySource, catalog } enum DatasetSortField { @@ -225,37 +211,31 @@ enum DatasetSortField { DATE_POSTED SIZE } -enum SortDirection { - ASC - DESC -} input DatasetOrderBy { field: DatasetSortField! direction: SortDirection! = DESC } -enum DatasetFacetField { - PUBLISHER - KEYWORD - LANGUAGE - FORMAT - CLASS - TERMINOLOGY_SOURCE - STATUS - IIIF - NDE_SCHEMA_AP - LINKED_DATA - TERMS - PERSISTENT_URIS +type ValueBucket { + value: String! # selection key: a media type, a token (group:rdf), or an IRI for reference facets + count: Int! 
+ label: [LanguageString!] # nullable; resolved data label for reference facets, else null } -type FacetBucket { - value: String! # the selection key (an IRI for reference facets, else a token/string) +type RangeBucket { + min: Float # half-open [min, max); max null = open-ended top bin + max: Float count: Int! - label: [LanguageString!] # nullable — see below } -type Facet { - field: DatasetFacetField! - buckets: [FacetBucket!]! +type DatasetFacets { + # one field per facetable field, typed by kind; selection = request, skip-own-filter applied + publisher: [ValueBucket!]! + keyword: [ValueBucket!]! + language: [ValueBucket!]! + format: [ValueBucket!]! + class: [ValueBucket!]! + terminologySource: [ValueBucket!]! + status: [ValueBucket!]! + size: [RangeBucket!]! } type DatasetSearchResult { @@ -263,7 +243,7 @@ type DatasetSearchResult { total: Int! page: Int! perPage: Int! - facets: [Facet!]! + facets: DatasetFacets! } type Query { @@ -272,19 +252,20 @@ type Query { where: DatasetWhere orderBy: DatasetOrderBy page: Int = 1 - perPage: Int = 20 - facets: [DatasetFacetField!] + perPage: Int = 20 # no `facets` arg – selecting facet keys IS the request ): DatasetSearchResult! } ``` Numbered pagination (`page`/`perPage` + `total`), per [ADR 3](./0003-search-api-core-query-model.md) – no Relay connection. The reference types -(`Organization`, `Term`) carry `id + name` (labelOnly) from DR’s sidecar labels collection, -resolved by the adapter. `publisher` is single (`dct:publisher` `maxCount 1`); `creator` is -search-only – its name feeds full-text `query` but it has no output field of its own, -mirroring the current card. `catalog` is filter-only, so it appears in `where` but not as an -output field. +carry `id + name` (labelOnly) from DR’s sidecar labels collection, resolved by the adapter. 
+`publisher` is single (`dct:publisher` `maxCount 1`); `creator` is search-only (its name feeds +full-text `query` but it has no output field); `catalog` is filter-only (in `where`, not output); +`class` is facet + filter but not output (its `group:` tokens surface only as facet buckets, never +as card values); `datePosted` is sortable + output only; and the NDE compatibility booleans +(`iiif`, `ndeSchemaAp`, `linkedData`, `terms`) are output-only vinkjes – in neither `where` nor the +facets until “filter by vinkje” ships. ### Resolver behaviour @@ -293,31 +274,27 @@ The single, generic root resolver (shipped in the package, not emitted): 1. **Args → `SearchQuery`** (pure): `query`→`text`; `where`→`Filter[]`; `orderBy`→`Sort[]` (`RELEVANCE`→reserved `relevance`); `page`/`perPage`→`offset`/`limit`; `facets`→logical names; `locale`←`context.acceptLanguage[0]`. -2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; - DR injects its policy here: default `status:=valid`; default sort `relevance` when a - `query` is present else `title`; and the `status_rank` tie-break appended to either. +2. **Apply `options.queryDefaults`** – the generic resolver bakes no deployment defaults; DR + injects its policy here: default `status:=valid`; default sort `relevance` when a `query` is + present else `title`; and the `status_rank` tie-break appended to either. 3. **`context.engine.search(query, schema)` → `SearchResult`.** 4. **`SearchResult` → output** – scalars pass through; a `LocalizedValue` map → - `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; - reference values likewise; facets keyed logical→enum. GraphQL field selection prunes. + `[LanguageString]` ordered by `options.languageOrder(available, acceptLanguage)`; reference + values likewise; facets resolve lazily, one engine query per selected facet key with that key’s own where-filter removed (skip-own-filter). GraphQL field selection prunes. 
-Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, -then untagged (`und`) last – so `[0]` is always the best available value. +Default `languageOrder`: Accept-Language entries first, then remaining tagged languages, then +untagged (`und`) last – so `[0]` is always the best available value. ### Lifecycle and performance -- **Built once at startup.** The consumer calls `buildSearchSchema` during boot and hands - the single `GraphQLSchema` to its server; the field model is static per deployment, so it - is never rebuilt per request. -- **Held and reused.** That one schema serves every request (Mercurius additionally - caches/compiles it). -- **Zero per-request penalty vs codegen.** A runtime-constructed schema is the same - `GraphQLSchema` object codegen would have produced; the only added cost is the one-time - build, sub-millisecond to low-single-digit-ms for a schema this size. +- **Built once at startup, reused for every request.** The field model is static per + deployment, so the single `GraphQLSchema` is constructed during boot (sub-millisecond to + low-single-digit-ms for a schema this size) and never rebuilt per request – the same object + codegen would have produced, with no per-request penalty (Mercurius additionally caches it). - **Hot path is the engine, not GraphQL.** Per-request cost is dominated by the Typesense round-trip; parse/validate/resolve of a small query is sub-millisecond. -- **Introspection serves the contract.** Cheap (a query against the built schema, cached by - clients). Leave it on, or disable in production and use `printSearchSchema` for tooling. +- **Introspection serves the contract** (cheap, client-cached). Leave it on, or disable in + production and use `printSearchSchema` for tooling. 
### Context contract @@ -333,19 +310,18 @@ Each transport populates it per request; no framework type appears in the packag ## Consequences - The GraphQL surface is configured at runtime from the - [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the - index or a later REST surface, and works under any GraphQL server. + [ADR 3](./0003-search-api-core-query-model.md) field model, so it cannot drift from the index + or a later REST surface, and works under any GraphQL server. - **Frozen (public contract):** `LanguageString`, the named reference types (`Organization`, `Term`, …), output types, `where` operators, `orderBy` enums, numbered-pagination args, facet types. Breaking to change – right in v1. -- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes - facets, the `SearchDocument` shape. -- **Named reference types** per shape (`Organization`, `Term`) rather than a single uniform - reference type – chosen for ergonomics and additive `inline` growth (`labelOnly` → `inline` - only adds fields, non-breaking). -- Deferred: a `dataset(id)` single-resource query (detail-page-on-index direction; DR detail - stays on SPARQL); cross-collection `@reference` joins beyond inline labels; cursor - pagination; a `Date` scalar (kept ISO `String`) and a `Long`/`BigInt` scalar for 64-bit - integers (kept `Float`); transport-layer persisted queries / cost - limits; a root or per-field language argument (Accept-Language is the sole preference - mechanism); metadata-language-availability filtering (a facetable dimension, not v1). +- **Internal:** args→`SearchQuery` mapping, language ordering, how the adapter computes facets, + the `SearchDocument` shape. +- **Named reference types** per shape rather than one uniform reference type – chosen for + ergonomics and additive `inline` growth (`labelOnly` → `inline` only adds fields). 
+- Deferred: a `dataset(id)` single-resource query (DR detail stays on SPARQL); cross-collection + `@reference` joins beyond inline labels; cursor pagination; a `Date` scalar (kept ISO + `String`) and a `Long`/`BigInt` scalar for 64-bit integers (kept `Float`); transport-layer + persisted queries / cost limits; a root or per-field language argument (Accept-Language is the + sole preference mechanism); metadata-language-availability filtering (a facetable dimension, + not v1). diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index fdfccf09..f7449793 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -42,6 +42,12 @@ export interface SearchContext { readonly engine: SearchEngine; /** Parsed, ordered `Accept-Language`; drives locale selection and output order. */ readonly acceptLanguage: readonly string[]; + /** + * Called when a single facet's computation fails. The facet degrades to an + * empty list (a supplementary facet must not fail the whole query); supply + * this to log the cause. Optional — omit to swallow silently. + */ + readonly onFacetError?: (field: string, error: unknown) => void; } export interface BuildSearchSchemaOptions { @@ -98,14 +104,14 @@ export function buildSearchSchema( value: { type: new GraphQLNonNull(GraphQLString) }, }, }); - const facetBucket = new GraphQLObjectType({ - name: 'FacetBucket', + // A plain value facet bucket: a selection key, its count, and (for reference + // facets) the engine-resolved data label; null for token/free-string facets + // whose display the consumer owns. 
+ const valueBucket = new GraphQLObjectType({ + name: 'ValueBucket', fields: { value: { type: new GraphQLNonNull(GraphQLString) }, count: { type: new GraphQLNonNull(GraphQLInt) }, - // Nullable: the resolved data label for a reference facet, else null — - // the consumer owns display for token/free-string facets (its i18n or the - // value itself). label: { type: new GraphQLList(new GraphQLNonNull(languageString)), resolve: (bucket: Source, _args: unknown, context: SearchContext) => { @@ -117,6 +123,22 @@ export function buildSearchSchema( }, }, }); + // A numeric range-facet bin: half-open `[min, max)` bounds (max null on an + // open-ended top bin) and the count of documents in it. + const rangeBucket = new GraphQLObjectType({ + name: 'RangeBucket', + fields: { + min: { + type: GraphQLFloat, + resolve: (bucket: Source) => bucket.min ?? null, + }, + max: { + type: GraphQLFloat, + resolve: (bucket: Source) => bucket.max ?? null, + }, + count: { type: new GraphQLNonNull(GraphQLInt) }, + }, + }); const sortDirection = new GraphQLEnumType({ name: 'SortDirection', values: { ASC: { value: 'asc' }, DESC: { value: 'desc' } }, @@ -289,19 +311,55 @@ export function buildSearchSchema( }, }); - const facetValues: GraphQLEnumValueConfigMap = {}; - for (const field of facetableFields(schema)) { - facetValues[screamingSnake(field.name)] = { value: field.name }; - } - const facetField = new GraphQLEnumType({ - name: `${typeName}FacetField`, - values: facetValues, - }); - const facet = new GraphQLObjectType({ - name: 'Facet', - fields: { - field: { type: new GraphQLNonNull(facetField) }, - buckets: { type: nonNullListOf(facetBucket) }, + // Keyed facets object: one field per facetable field, typed by its kind + // (range fields → [RangeBucket!], else [ValueBucket!]). 
Each field's resolver + // computes that facet with its OWN where-filter removed (skip-own-filter), so a + // multi-select facet still lists its other options; only the selected fields + // are resolved (GraphQL prunes the rest), so the selection IS the request. + const facetsType = new GraphQLObjectType({ + name: `${typeName}Facets`, + fields: () => { + const fields: Record< + string, + GraphQLFieldConfig + > = {}; + for (const field of facetableFields(schema)) { + const isRange = + field.facetRanges !== undefined && field.facetRanges.length > 0; + fields[field.name] = { + type: nonNullListOf(isRange ? rangeBucket : valueBucket), + resolve: async ( + source: Source, + _args: unknown, + context: SearchContext, + ) => { + const query = source.query as SearchQuery; + // Drop this facet's own filter so its other options still count + // (a removed `status` filter also drops the valid-only default, so + // the status facet counts across every status). + const facetQuery: SearchQuery = { + ...query, + where: query.where.filter( + (filter) => filter.field !== field.name, + ), + facets: [field.name], + limit: 0, + offset: 0, + }; + // A facet is supplementary: degrade a failed facet to an empty list + // rather than failing the whole query (which would null the non-null + // result and discard the items + every other facet). + try { + const result = await context.engine.search(facetQuery, schema); + return result.facets[field.name] ?? []; + } catch (error) { + context.onFacetError?.(field.name, error); + return []; + } + }, + }; + } + return fields; }, }); @@ -312,7 +370,12 @@ export function buildSearchSchema( total: { type: new GraphQLNonNull(GraphQLInt) }, page: { type: new GraphQLNonNull(GraphQLInt) }, perPage: { type: new GraphQLNonNull(GraphQLInt) }, - facets: { type: nonNullListOf(facet) }, + // Resolved lazily, per selected key (skip-own-filter); the result object + // (which carries the resolved `query`) is the facets source. 
+ facets: { + type: new GraphQLNonNull(facetsType), + resolve: (source: Source) => source, + }, }, }); @@ -327,23 +390,29 @@ export function buildSearchSchema( orderBy: { type: orderByInput }, page: { type: GraphQLInt, defaultValue: 1 }, perPage: { type: GraphQLInt, defaultValue: 20 }, - facets: { type: new GraphQLList(new GraphQLNonNull(facetField)) }, }, resolve: async (_source, args, context: SearchContext) => { const built = argsToQuery(args as QueryArgs, context, schema); const finalQuery = options.queryDefaults ? options.queryDefaults(built, context) : built; - const result = await context.engine.search(finalQuery, schema); + // Items + total only; facets are resolved lazily per selected key. + const result = await context.engine.search( + { ...finalQuery, facets: [] }, + schema, + ); return { items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), total: result.total, - page: Math.floor(finalQuery.offset / finalQuery.limit) + 1, + // Guard against a `perPage: 0` arg: `Math.floor(0/0)` is NaN, which a + // non-null `Int!` cannot serialize and would fail the whole query. + page: + finalQuery.limit > 0 + ? Math.floor(finalQuery.offset / finalQuery.limit) + 1 + : 1, perPage: finalQuery.limit, - facets: Object.entries(result.facets).map(([field, buckets]) => ({ - field, - buckets, - })), + // Carried for the facets resolver (skip-own-filter per key). + query: finalQuery, }; }, }, @@ -372,7 +441,6 @@ interface QueryArgs { readonly orderBy?: { field: string; direction: 'asc' | 'desc' }; readonly page?: number; readonly perPage?: number; - readonly facets?: readonly string[]; } /** Pure args → {@link SearchQuery} mapping. */ @@ -391,7 +459,8 @@ function argsToQuery( : [], limit: perPage, offset: (page - 1) * perPage, - facets: args.facets ?? [], + // Facets are requested per-key by the facets resolver, not via an arg. + facets: [], locale: context.acceptLanguage[0] ?? 
'und', }; } diff --git a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap index d1741f4e..63bc19de 100644 --- a/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap +++ b/packages/search-api-graphql/test/__snapshots__/generator-stability.test.ts.snap @@ -2,7 +2,7 @@ exports[`GraphQL generator stability > emits a stable SDL for a representative schema 1`] = ` "type Query { - things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20, facets: [ThingFacetField!]): ThingSearchResult! + things(query: String, where: ThingWhere, orderBy: ThingOrderBy, page: Int = 1, perPage: Int = 20): ThingSearchResult! } type ThingSearchResult { @@ -10,7 +10,7 @@ type ThingSearchResult { total: Int! page: Int! perPage: Int! - facets: [Facet!]! + facets: ThingFacets! } type Thing { @@ -37,20 +37,15 @@ type Agent { name: [LanguageString!]! } -type Facet { - field: ThingFacetField! - buckets: [FacetBucket!]! +type ThingFacets { + keyword: [ValueBucket!]! + creator: [ValueBucket!]! + publisher: [ValueBucket!]! + status: [ValueBucket!]! + open: [ValueBucket!]! } -enum ThingFacetField { - KEYWORD - CREATOR - PUBLISHER - STATUS - OPEN -} - -type FacetBucket { +type ValueBucket { value: String! count: Int! label: [LanguageString!] 
diff --git a/packages/search-api-graphql/test/build-schema.test.ts b/packages/search-api-graphql/test/build-schema.test.ts index b61ba240..6ba323de 100644 --- a/packages/search-api-graphql/test/build-schema.test.ts +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -41,7 +41,12 @@ const schema: SearchSchema = { kind: 'integer', filterable: true, sortable: true, + facetable: true, output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10 }, + ], }, { name: 'datePosted', kind: 'date', sortable: true, output: true }, { name: 'score', kind: 'number', output: true }, @@ -148,7 +153,7 @@ describe('buildSearchSchema', () => { status iiif } - facets { field buckets { value count } } + facets { keyword { value count } } } }`, { engine, acceptLanguage: ['nl'] }, @@ -176,9 +181,9 @@ describe('buildSearchSchema', () => { { id: 'https://term/1', name: [{ language: 'nl', value: 'Kaarten' }] }, ]); expect(item.iiif).toBe(true); - expect(data.facets).toEqual([ - { field: 'KEYWORD', buckets: [{ value: 'kaarten', count: 3 }] }, - ]); + expect(data.facets).toEqual({ + keyword: [{ value: 'kaarten', count: 3 }], + }); // The free-text arg became the query text. expect(received().text).toBe('kaart'); }); @@ -213,7 +218,7 @@ describe('buildSearchSchema', () => { ], }); const result = await run( - `{ datasets { items { title { language value } } } }`, + `{ datasets { items { title { language value } datePosted } } }`, { engine, acceptLanguage: ['en'] }, ); const item = ( @@ -226,6 +231,8 @@ describe('buildSearchSchema', () => { { language: 'nl', value: 'Titel' }, { language: null, value: 'Naamloos' }, ]); + // An absent date resolves to null (the non-numeric branch). 
+ expect(item.datePosted).toBeNull(); }); it('labels reference-facet buckets, leaving plain-facet buckets null', async () => { @@ -244,26 +251,165 @@ describe('buildSearchSchema', () => { }, }); const result = await run( - `{ datasets { facets { field buckets { value count label { language value } } } } }`, + `{ datasets { facets { + publisher { value count label { language value } } + keyword { value count label { language value } } + } } }`, { engine, acceptLanguage: ['nl'] }, ); const facets = (result.data?.datasets as Record) - .facets as { field: string; buckets: unknown[] }[]; - const publisher = facets.find((facet) => facet.field === 'PUBLISHER'); - const keyword = facets.find((facet) => facet.field === 'KEYWORD'); - expect(publisher?.buckets).toEqual([ + .facets as { + publisher: unknown[]; + keyword: unknown[]; + }; + expect(facets.publisher).toEqual([ { value: 'https://org/1', count: 2, label: [{ language: 'nl', value: 'Het Utrechts Archief' }], }, ]); - expect(keyword?.buckets).toEqual([ + expect(facets.keyword).toEqual([ { value: 'kaarten', count: 3, label: null }, ]); }); - it('maps where, orderBy, facets and pagination into the SearchQuery', async () => { + it('exposes range-facet bucket bounds, null for value facets and open ends', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { + size: [ + { value: '0', count: 2, min: 1, max: 10 }, + // Open-ended top bin: lower bound only. + { value: '1', count: 5, min: 10 }, + ], + keyword: [{ value: 'kaarten', count: 3 }], + }, + }); + const result = await run( + `{ datasets { facets { + size { min max count } + keyword { value count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as { + size: unknown[]; + keyword: unknown[]; + }; + // RangeBuckets carry their half-open bounds (max null = open-ended top bin). 
+ expect(facets.size).toEqual([ + { min: 1, max: 10, count: 2 }, + { min: 10, max: null, count: 5 }, + ]); + // A value facet's ValueBuckets carry no bounds. + expect(facets.keyword).toEqual([{ value: 'kaarten', count: 3 }]); + }); + + it('resolves every selected facet key, returning [] where the engine has none', async () => { + const { engine } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + const result = await run( + `{ datasets { facets { + keyword { value count } + publisher { value count } + terminologySource { value count } + status { value count } + iiif { value count } + size { min max count } + } } }`, + { engine, acceptLanguage: ['nl'] }, + ); + const facets = (result.data?.datasets as Record) + .facets as Record; + expect(facets.keyword).toEqual([{ value: 'kaarten', count: 1 }]); + // Keys the engine returned nothing for resolve to an empty list. + for (const key of [ + 'publisher', + 'terminologySource', + 'status', + 'iiif', + 'size', + ]) { + expect(facets[key]).toEqual([]); + } + }); + + it('computes a facet with its own where-filter removed (skip-own-filter)', async () => { + const { engine, received } = fakeEngine({ + total: 0, + hits: [], + facets: { keyword: [{ value: 'kaarten', count: 1 }] }, + }); + await run( + `{ datasets(where: { keyword: { in: ["x"] }, status: { in: ["valid"] } }) { + facets { keyword { value count } } + } }`, + { engine, acceptLanguage: ['nl'] }, + ); + // The keyword facet query is run with the keyword filter dropped (so its + // other options still count), but other filters (status) retained. 
+ const facetQuery = received(); + expect(facetQuery.facets).toEqual(['keyword']); + expect( + facetQuery.where.find((filter) => filter.field === 'keyword'), + ).toBeUndefined(); + expect(facetQuery.where).toContainEqual({ field: 'status', in: ['valid'] }); + }); + + it('degrades a failed facet to an empty list without failing the whole query', async () => { + // A facet is supplementary: its computation runs a separate search (with + // `facets` set). Fail only that, leaving the listing search untouched. + const failedFacets: string[] = []; + const engine: SearchEngine = { + async search(query) { + if (query.facets.length > 0) { + throw new Error('facet backend unavailable'); + } + return canned; + }, + }; + const result = await run( + `{ datasets { + total + items { id } + facets { keyword { value count } } + } }`, + { + engine, + acceptLanguage: ['nl'], + onFacetError: (field) => failedFacets.push(field), + }, + ); + + // No top-level error: the failed facet degraded rather than nulling the + // non-null result and discarding the items. + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.total).toBe(1); + expect((data.items as Record[])[0].id).toBe('https://d/1'); + // The failed facet degraded to an empty list, and the cause was reported. 
+ expect((data.facets as Record).keyword).toEqual([]); + expect(failedFacets).toEqual(['keyword']); + }); + + it('guards perPage: 0, resolving page to 1 rather than failing on NaN', async () => { + const { engine } = fakeEngine(canned); + const result = await run(`{ datasets(perPage: 0) { page total } }`, { + engine, + acceptLanguage: ['nl'], + }); + expect(result.errors).toBeUndefined(); + const data = result.data?.datasets as Record; + expect(data.page).toBe(1); + }); + + it('maps where, orderBy and pagination into the SearchQuery', async () => { const { engine, received } = fakeEngine(canned); await run( `{ @@ -272,7 +418,6 @@ describe('buildSearchSchema', () => { orderBy: { field: SIZE, direction: ASC } page: 3 perPage: 10 - facets: [KEYWORD, PUBLISHER] ) { total } }`, { engine, acceptLanguage: ['nl'] }, @@ -288,7 +433,9 @@ describe('buildSearchSchema', () => { }); expect(query.where).toContainEqual({ field: 'iiif', is: true }); expect(query.orderBy).toEqual([{ field: 'size', direction: 'asc' }]); - expect(query.facets).toEqual(['keyword', 'publisher']); + // Facets are requested per key via selection, not an arg; the listing query + // carries none. + expect(query.facets).toEqual([]); expect(query.limit).toBe(10); expect(query.offset).toBe(20); }); @@ -336,12 +483,15 @@ describe('buildSearchSchema', () => { expect(sdl).toMatch(/publisher: Organization\b(?!!)/); // optional reference }); - it('builds the where, orderBy and facet enums from the field model', () => { + it('builds the where, orderBy enum and keyed facets object from the field model', () => { const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); expect(sdl).toMatch(/enum DatasetSortField/); expect(sdl).toMatch(/RELEVANCE/); expect(sdl).toMatch(/SIZE/); - expect(sdl).toMatch(/enum DatasetFacetField/); + // Facets are a keyed object, one field per facetable field, typed by kind. 
+ expect(sdl).toMatch(/type DatasetFacets/); + expect(sdl).toMatch(/keyword: \[ValueBucket!\]!/); + expect(sdl).toMatch(/size: \[RangeBucket!\]!/); expect(sdl).toMatch(/input DatasetWhere/); expect(sdl).toMatch(/status: StringFilter/); expect(sdl).toMatch(/size: IntRange/); diff --git a/packages/search-api-graphql/vite.config.ts b/packages/search-api-graphql/vite.config.ts index 725cf854..7434ca80 100644 --- a/packages/search-api-graphql/vite.config.ts +++ b/packages/search-api-graphql/vite.config.ts @@ -10,10 +10,10 @@ export default mergeConfig( test: { coverage: { thresholds: { - functions: 90, - lines: 90, - branches: 78, - statements: 90, + functions: 100, + lines: 100, + branches: 88.63, + statements: 100, }, }, }, diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts index 5141f634..37f0d378 100644 --- a/packages/search-typesense/src/collection-schema.ts +++ b/packages/search-typesense/src/collection-schema.ts @@ -114,14 +114,6 @@ function typesenseFields( }); } } - if (names.group !== undefined) { - fields.push({ - name: names.group, - type: valueType, - facet: true, - optional: true, - }); - } return fields; } diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index fc9d4950..662eb393 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -3,6 +3,7 @@ import { fold } from '@lde/text-normalization'; import { physicalFields, searchableFields, + type FacetRange, type Filter, type SearchField, type SearchQuery, @@ -17,9 +18,21 @@ import { * {@link physicalFields}, the same convention the projection and the collection * schema use, so a query can never reference a field the index does not carry. */ +export interface CompileOptions { + /** + * Cap on the number of buckets returned per facet (`max_facet_values`). 
Left + * unset, Typesense defaults to 10 — too few for high-cardinality facets + * (publisher, keyword), so a deployment with such facets must raise it. Range + * facets return one bucket per declared range regardless, but a value > the + * range count is still safe. + */ + readonly maxFacetValues?: number; +} + export function buildSearchParams( query: SearchQuery, schema: SearchSchema, + options: CompileOptions = {}, ): SearchParams { const folded = query.text !== undefined && query.text.length > 0 @@ -35,7 +48,9 @@ export function buildSearchParams( query_by: names.join(','), query_by_weights: weights.join(','), per_page: query.limit, - page: Math.floor(query.offset / query.limit) + 1, + // A facet-only query (`limit: 0`) fetches no hits; page is then meaningless, + // so pin it to 1 rather than dividing by zero. + page: query.limit > 0 ? Math.floor(query.offset / query.limit) + 1 : 1, }; if (filterBy.length > 0) { params.filter_by = filterBy; @@ -44,11 +59,46 @@ export function buildSearchParams( params.sort_by = sortBy; } if (query.facets.length > 0) { - params.facet_by = query.facets.join(','); + params.facet_by = compileFacetBy(query.facets, schema); + if (options.maxFacetValues !== undefined) { + params.max_facet_values = options.maxFacetValues; + } } return params; } +/** + * The `facet_by` clause. A facet on a numeric field that declares + * {@link SearchField.facetRanges} is faceted into those fixed half-open `[min, max)` + * bins (a histogram); every other facet is a plain per-value facet on its field + * name. Typesense range syntax is already start-inclusive/end-exclusive, so the + * declared bounds pass straight through with no boundary fix-up. + */ +function compileFacetBy( + facets: readonly string[], + schema: SearchSchema, +): string { + return facets + .map((name) => { + const field = schema.fields.find((candidate) => candidate.name === name); + return field?.facetRanges !== undefined && field.facetRanges.length > 0 + ? 
compileRangeFacet(field.name, field.facetRanges) + : name; + }) + .join(','); +} + +/** `name(key:[min, max], …)`; a blank bound is open-ended (Typesense `[75, ]`). */ +function compileRangeFacet( + name: string, + ranges: readonly FacetRange[], +): string { + const bins = ranges + .map((range) => `${range.key}:[${range.min ?? ''}, ${range.max ?? ''}]`) + .join(', '); + return `${name}(${bins})`; +} + /** * The `query_by` fields and aligned weights. Each searchable field expands to its * folded `*_search` companion(s); a localized field’s active-locale companion @@ -116,30 +166,14 @@ function compileFilter( } /** - * A membership clause. A grouped field splits its values into `prefix`-tagged - * group tokens (matched against the `_group` companion) and granular values, and - * ORs the two so selecting a value and a group within one facet unions instead of - * intersecting. A non-facet (tokenized) field uses the exact `:=` operator so an - * IRI cannot partial-match on a shared path segment. + * A membership clause. A non-facet (tokenized) field uses the exact `:=` + * operator so an IRI cannot partial-match on a shared path segment. */ function compileMembership( field: SearchField, values: readonly string[], ): string { const exact = field.facetable !== true; - if (field.group !== undefined) { - const prefix = field.group.prefix; - const groups = values.filter((value) => value.startsWith(prefix)); - const granular = values.filter((value) => !value.startsWith(prefix)); - const parts: string[] = []; - if (granular.length > 0) { - parts.push(membership(field.name, granular, exact)); - } - if (groups.length > 0) { - parts.push(membership(field.group.name, groups, false)); - } - return parts.length > 1 ? 
/**
 * Wraps a filter value in backticks so that characters such as `:`, `/`, `&`
 * and `,` are taken literally instead of being parsed as filter syntax. Each
 * backtick inside the value is prefixed with a backslash.
 *
 * @param value - The raw filter value (e.g. an IRI).
 * @returns The backtick-quoted value.
 */
export function escapeFilterValue(value: string): string {
  const escaped = value.split('`').join('\\`');
  return '`' + escaped + '`';
}
When set (and {@link labelsCollection} is + * set), the FULL sidecar `labels` collection is loaded once via the documents + * export endpoint and held in a process-lifetime cache for this many + * milliseconds; each `search` then resolves its reference labels by in-memory + * lookup instead of a per-search `multi_search` round-trip. Omit to keep the + * per-search {@link fetchLabels} behaviour unchanged. + */ + readonly labelCacheTtlMs?: number; } /** @@ -34,41 +54,137 @@ export function createTypesenseSearchEngine( client: Client, options: TypesenseSearchEngineOptions, ): SearchEngine { + // Process-lifetime cache for the FULL `labels` collection, held in the engine + // closure. Populated lazily on the first cached search; `loadAll` is the + // single-flight in-flight promise so concurrent first-loads share one export. + let cachedLabels: ReadonlyMap | undefined; + let cacheExpiresAt = 0; + let inFlightLoad: Promise> | undefined; + + function cachedAllLabels( + labelsCollection: string, + ttlMs: number, + ): Promise> { + if (cachedLabels !== undefined && Date.now() < cacheExpiresAt) { + return Promise.resolve(cachedLabels); + } + // Single-flight: a load already running serves every concurrent caller. + inFlightLoad ??= loadAllLabels(client, labelsCollection) + .then((loaded) => { + cachedLabels = loaded; + cacheExpiresAt = Date.now() + ttlMs; + return loaded; + }) + // A failed load degrades to id-only references and is NOT cached, so the + // next search retries rather than serving an empty map for the whole TTL. 
+ .catch((error) => { + options.onLabelError?.(error); + return new Map(); + }) + .finally(() => { + inFlightLoad = undefined; + }); + return inFlightLoad; + } + return { async search( query: SearchQuery, schema: SearchSchema, ): Promise { - const params = buildSearchParams(query, schema); + const params = buildSearchParams(query, schema, { + maxFacetValues: options.maxFacetValues, + }); const response = (await client .collections(options.collection) .documents() .search(params)) as TypesenseSearchResponse; - const labels = - options.labelsCollection !== undefined - ? await fetchLabels( + // Labels are supplementary: a failed lookup (e.g. the sidecar collection + // mid-rebuild) degrades to id-only references rather than failing the whole + // search, so the listing still renders with bare IRIs. + let labels: ReadonlyMap = new Map(); + if (options.labelsCollection !== undefined) { + if (options.labelCacheTtlMs !== undefined) { + // Cached path: resolve the page's references by in-memory lookup + // against the once-loaded collection (no Typesense round-trip). + const allLabels = await cachedAllLabels( + options.labelsCollection, + options.labelCacheTtlMs, + ); + labels = selectLabels(allLabels, referenceIris(response, schema)); + } else { + try { + labels = await fetchLabels( client, options.labelsCollection, referenceIris(response, schema), - ) - : new Map(); + ); + } catch (error) { + options.onLabelError?.(error); + } + } + } return parseSearchResponse(response, schema, labels); }, }; } -/** Every distinct reference IRI across the page of hits. */ +/** + * Load the FULL `labels` collection into a label map via the documents export + * endpoint, which streams every document as JSONL (one JSON object per line). + * Each line is reconstructed by {@link labelToLocalizedValue}, exactly as the + * per-search {@link fetchLabels} path does for its `multi_search` hits. 
/**
 * Picks, from the fully loaded label map, only the entries for the given
 * IRIs. An IRI with no entry in `allLabels` is left out of the result.
 *
 * @param allLabels - Every known label, keyed by IRI.
 * @param iris - The IRIs whose labels are wanted.
 * @returns A new map holding the found labels, in first-seen `iris` order.
 */
function selectLabels(
  allLabels: ReadonlyMap<string, LocalizedValue>,
  iris: readonly string[],
): Map<string, LocalizedValue> {
  const selected = new Map<string, LocalizedValue>();
  iris.forEach((iri) => {
    const found = allLabels.get(iri);
    if (found === undefined) {
      return;
    }
    selected.set(iri, found);
  });
  return selected;
}
/**
 * Resolve labels for `iris` from the sidecar `labels` collection. Each
 * `label_${locale}` becomes a language-map entry; the default `label` is the
 * untagged (`und`) fallback when no locale variant exists.
 *
 * Sent over `multi_search` (POST) in batches: the id-list of a page or facet
 * carrying many references — e.g. a dataset with dozens of classes — would
 * overflow Typesense’s GET query-string limit if it travelled in the URL. POST
 * puts it in the body; the batch size stays under Typesense’s `per_page` cap.
 * Exported for unit testing against a fake client.
 *
 * @param client - Anything exposing the Typesense `multiSearch` API.
 * @param collection - The sidecar `labels` collection name.
 * @param iris - The IRIs to resolve; an empty list makes no request.
 * @returns A map from IRI to its label; IRIs absent from the collection have
 *   no entry.
 * @throws Error when a batch's search result carries an inline `error`.
 *   `multi_search` reports a per-search failure inside `results` instead of
 *   rejecting the request, so without this check a failed lookup would look
 *   like "no labels found" and never reach the caller's error handling.
 */
export async function fetchLabels(
  client: Pick<Client, 'multiSearch'>,
  collection: string,
  iris: readonly string[],
): Promise<Map<string, LocalizedValue>> {
  const labels = new Map<string, LocalizedValue>();
  for (let start = 0; start < iris.length; start += LABEL_BATCH_SIZE) {
    const batch = iris.slice(start, start + LABEL_BATCH_SIZE);
    const filter = `id:[${batch.map(escapeFilterValue).join(',')}]`;
    const { results } = (await client.multiSearch.perform({
      searches: [
        {
          collection,
          q: '*',
          query_by: 'label',
          filter_by: filter,
          per_page: batch.length,
        },
      ],
    })) as {
      results: readonly (TypesenseSearchResponse & {
        readonly code?: number;
        readonly error?: string;
      })[];
    };
    const result = results[0];
    if (result?.error !== undefined) {
      // Surface the inline failure so the caller can degrade and report it.
      throw new Error(
        `Label lookup in collection "${collection}" failed` +
          (result.code !== undefined ? ` (${result.code})` : '') +
          `: ${result.error}`,
      );
    }
    for (const hit of result?.hits ?? []) {
      labels.set(String(hit.document.id), labelToLocalizedValue(hit.document));
    }
  }
  return labels;
}

/** Typesense caps `per_page` at 250; the multi_search POST body holds the
 * id-list comfortably, so resolve references in batches of this size. */
const LABEL_BATCH_SIZE = 200;
{ min: range.min } : {}), + ...(range?.max !== undefined ? { max: range.max } : {}), + }; }); } return { hits, total: response.found, facets }; diff --git a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap index 201512f7..e56c6447 100644 --- a/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap +++ b/packages/search-typesense/test/__snapshots__/generator-stability.test.ts.snap @@ -63,12 +63,6 @@ exports[`collection-schema generator stability > derives a stable Typesense coll "sort": false, "type": "string[]", }, - { - "facet": true, - "name": "format_group", - "optional": true, - "type": "string[]", - }, { "facet": true, "name": "creator", diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts index 51511122..8d82507d 100644 --- a/packages/search-typesense/test/collection-schema.test.ts +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -30,7 +30,6 @@ const schema: SearchSchema = { kind: 'keyword', array: true, facetable: true, - group: { name: 'format_group', prefix: 'group:' }, }, // Derived fields (no path) still get collection fields — populated at index // time by derivations, not projected. 
@@ -190,13 +189,4 @@ describe('buildCollectionSchema', () => { locale: 'nl', }); }); - - it('emits the grouped-facet companion for a field that declares a group', () => { - expect(collection.fields).toContainEqual({ - name: 'format_group', - type: 'string[]', - facet: true, - optional: true, - }); - }); }); diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts index 2383ecde..bb7eca2a 100644 --- a/packages/search-typesense/test/generator-stability.test.ts +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -6,8 +6,8 @@ import { buildCollectionSchema } from '../src/collection-schema.js'; * A neutral fixture exercising every kind + capability — NOT a real domain. The * derived Typesense collection is snapshotted purely to pin the **generator**: * any change to how `buildCollectionSchema` maps the field model (Typesense field - * types, the physical fanout, stem/locale, optional/default-sorting-field, group - * companions) surfaces as a snapshot diff before this library is published. + * types, the physical fanout, stem/locale, optional/default-sorting-field) + * surfaces as a snapshot diff before this library is published. 
*/ const THING: SearchSchema = { type: 'https://example.org/Thing', @@ -35,7 +35,6 @@ const THING: SearchSchema = { array: true, facetable: true, filterable: true, - group: { name: 'format_group', prefix: 'group:' }, }, { name: 'creator', diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts index 50e601a4..55a09bdd 100644 --- a/packages/search-typesense/test/parse-response.test.ts +++ b/packages/search-typesense/test/parse-response.test.ts @@ -1,6 +1,11 @@ -import { describe, expect, it } from 'vitest'; -import type { LocalizedValue, SearchSchema } from '@lde/search'; -import { parseSearchResponse } from '../src/search.js'; +import { afterEach, describe, expect, it, vi } from 'vitest'; +import type { LocalizedValue, SearchQuery, SearchSchema } from '@lde/search'; +import type { Client } from 'typesense'; +import { + createTypesenseSearchEngine, + fetchLabels, + parseSearchResponse, +} from '../src/search.js'; const schema: SearchSchema = { type: 'http://www.w3.org/ns/dcat#Dataset', @@ -143,3 +148,318 @@ describe('parseSearchResponse', () => { expect(result.hits[0].document.status).toBeUndefined(); }); }); + +describe('parseSearchResponse range facets', () => { + const rangeSchema: SearchSchema = { + type: 'http://www.w3.org/ns/dcat#Dataset', + fields: [ + { + name: 'size', + kind: 'integer', + facetable: true, + output: true, + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + // Open-ended top bin: no upper bound. 
+ { key: '2', min: 100 }, + ], + }, + ], + }; + + const rangeResponse = { + found: 5, + hits: [], + facet_counts: [ + { + field_name: 'size', + counts: [ + { value: '0', count: 2 }, + { value: '1', count: 1 }, + { value: '2', count: 2 }, + ], + }, + ], + }; + + it('echoes each range bin’s half-open bounds onto its bucket, open ends omitted', () => { + const result = parseSearchResponse(rangeResponse, rangeSchema, new Map()); + expect(result.facets.size).toEqual([ + { value: '0', count: 2, min: 1, max: 10 }, + { value: '1', count: 1, min: 10, max: 100 }, + // The open-ended top bin carries only its lower bound. + { value: '2', count: 2, min: 100 }, + ]); + }); +}); + +describe('createTypesenseSearchEngine label degradation', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // A fake client whose document search succeeds but whose label lookup + // (multi_search) rejects, so the engine must degrade to id-only references. + function fakeClient(): Client { + return { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + }), + }), + multiSearch: { + perform: () => + Promise.reject(new Error('labels collection unavailable')), + }, + } as unknown as Client; + } + + it('degrades to id-only references when the label lookup fails, reporting the cause', async () => { + let capturedError: unknown; + const engine = createTypesenseSearchEngine(fakeClient(), { + collection: 'datasets', + labelsCollection: 'labels', + onLabelError: (error) => { + capturedError = error; + }, + }); + const result = await engine.search(baseQuery, schema); + // The reference is present but unlabelled: the failed lookup degraded + // rather than failing the whole search. 
+ expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + }); +}); + +describe('createTypesenseSearchEngine label cache (labelCacheTtlMs)', () => { + const baseQuery: SearchQuery = { + where: [], + orderBy: [], + limit: 10, + offset: 0, + facets: [], + locale: 'nl', + }; + + // One labels document, as the export endpoint streams it (JSONL). + const labelsJsonl = JSON.stringify({ + id: 'https://org/1', + label: 'Het Utrechts Archief', + label_nl: 'Het Utrechts Archief', + }); + + // A fake client whose document search always returns one hit referencing + // `https://org/1`, and whose `labels` collection export is driven by + // `exportImpl`. Counters make the export-call count observable. + function fakeClient(exportImpl: () => Promise) { + let exportCalls = 0; + const client = { + collections: () => ({ + documents: () => ({ + search: () => + Promise.resolve({ + found: 1, + hits: [ + { + document: { id: 'https://d/1', publisher: ['https://org/1'] }, + }, + ], + }), + export: () => { + exportCalls += 1; + return exportImpl(); + }, + }), + }), + }; + return { + client: client as unknown as Client, + exportCalls: () => exportCalls, + }; + } + + afterEach(() => { + vi.useRealTimers(); + }); + + it('loads the collection once for concurrent searches (single-flight)', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + const results = await Promise.all([ + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + engine.search(baseQuery, schema), + ]); + + // One export served all three concurrent searches. 
+ expect(exportCalls()).toBe(1); + for (const result of results) { + expect(result.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + } + }); + + it('serves a later search from cache without a second export', async () => { + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + }); + + await engine.search(baseQuery, schema); + await engine.search(baseQuery, schema); + + expect(exportCalls()).toBe(1); + }); + + it('reloads the collection after the TTL expires', async () => { + vi.useFakeTimers(); + const { client, exportCalls } = fakeClient(() => + Promise.resolve(labelsJsonl), + ); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 1000, + }); + + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Within the TTL: still cached. + vi.advanceTimersByTime(500); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(1); + + // Past the TTL: reload. + vi.advanceTimersByTime(600); + await engine.search(baseQuery, schema); + expect(exportCalls()).toBe(2); + }); + + it('degrades to id-only references on a load error and retries next time', async () => { + let capturedError: unknown; + let attempt = 0; + const { client, exportCalls } = fakeClient(() => { + attempt += 1; + return attempt === 1 + ? Promise.reject(new Error('labels collection unavailable')) + : Promise.resolve(labelsJsonl); + }); + const engine = createTypesenseSearchEngine(client, { + collection: 'datasets', + labelsCollection: 'labels', + labelCacheTtlMs: 60_000, + onLabelError: (error) => { + capturedError = error; + }, + }); + + // First load fails: id-only reference, error reported, nothing cached. 
+ const failed = await engine.search(baseQuery, schema); + expect(failed.hits[0].document.publisher).toEqual([ + { id: 'https://org/1' }, + ]); + expect(capturedError).toBeInstanceOf(Error); + expect(exportCalls()).toBe(1); + + // Next search retries the load (the failure was not cached) and resolves. + const recovered = await engine.search(baseQuery, schema); + expect(recovered.hits[0].document.publisher).toEqual([ + { id: 'https://org/1', label: { nl: ['Het Utrechts Archief'] } }, + ]); + expect(exportCalls()).toBe(2); + }); +}); + +describe('fetchLabels', () => { + // A fake Typesense client whose multi_search returns the requested ids that + // exist in `docsById`, recording the id-list of each POST so batching is + // observable. (Resolving via multi_search/POST avoids the GET query-string + // limit that a large id-list would otherwise overflow.) + function fakeClient(docsById: Record>) { + const calls: string[][] = []; + const client = { + multiSearch: { + perform: (request: { searches: { readonly filter_by: string }[] }) => { + const ids = [ + ...request.searches[0].filter_by.matchAll(/`([^`]+)`/g), + ].map((match) => match[1]); + calls.push(ids); + const hits = ids + .filter((id) => docsById[id] !== undefined) + .map((id) => ({ document: { id, ...docsById[id] } })); + return Promise.resolve({ results: [{ found: hits.length, hits }] }); + }, + }, + }; + return { client: client as unknown as Pick, calls }; + } + + it('resolves labels via multi_search, merging per-locale variants', async () => { + const { client, calls } = fakeClient({ + 'https://org/1': { label: 'KB', label_nl: 'KB' }, + // Only a default label (no locale variant) → untagged (`und`) fallback. 
+ 'https://org/3': { label: 'Untagged' }, + }); + const labels = await fetchLabels(client, 'labels', [ + 'https://org/1', + 'https://org/2', + 'https://org/3', + ]); + expect(labels.get('https://org/1')).toEqual({ nl: ['KB'] }); + expect(labels.get('https://org/3')).toEqual({ und: ['Untagged'] }); + // An IRI absent from the collection yields no entry. + expect(labels.has('https://org/2')).toBe(false); + expect(calls).toHaveLength(1); + }); + + it('batches a large id-list under the per_page cap, one POST per batch', async () => { + const ids = Array.from( + { length: 450 }, + (_unused, index) => `https://example.org/class/${index}`, + ); + const docsById = Object.fromEntries( + ids.map((id) => [id, { label_nl: id }]), + ); + const { client, calls } = fakeClient(docsById); + const labels = await fetchLabels(client, 'labels', ids); + // 450 ids → batches of 200, 200, 50. + expect(calls.map((batch) => batch.length)).toEqual([200, 200, 50]); + expect(labels.size).toBe(450); + }); + + it('makes no request for an empty id-list', async () => { + const { client, calls } = fakeClient({}); + const labels = await fetchLabels(client, 'labels', []); + expect(labels.size).toBe(0); + expect(calls).toHaveLength(0); + }); +}); diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index acdd9f7a..6556e7b3 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -30,12 +30,23 @@ const schema: SearchSchema = { array: true, facetable: true, filterable: true, - group: { name: 'format_group', prefix: 'group:' }, }, // Filter-only, non-facet (tokenized) → exact `:=` membership. 
{ name: 'catalog', kind: 'keyword', array: true, filterable: true }, { name: 'status', kind: 'keyword', facetable: true, filterable: true }, - { name: 'size', kind: 'integer', filterable: true, sortable: true }, + { + name: 'size', + kind: 'integer', + filterable: true, + sortable: true, + facetable: true, + // Half-open `[min, max)` bins; the last is open-ended (no upper bound). + facetRanges: [ + { key: '0', min: 1, max: 10 }, + { key: '1', min: 10, max: 100 }, + { key: '2', min: 100 }, + ], + }, { name: 'iiif', kind: 'boolean', filterable: true, facetable: true }, ], }; @@ -78,7 +89,7 @@ describe('buildSearchParams', () => { ).toBe(3); }); - it('compiles where clauses, with exact membership for non-facet fields and grouped OR', () => { + it('compiles where clauses, with exact membership for non-facet fields', () => { const params = buildSearchParams( { ...base, @@ -97,7 +108,7 @@ describe('buildSearchParams', () => { 'status:[`valid`] && ' + 'keyword:[`kaarten`,`atlas`] && ' + 'catalog:=[`urn:cat`] && ' + - '(format:[`text/turtle`] || format_group:[`group:rdf`]) && ' + + 'format:[`text/turtle`,`group:rdf`] && ' + 'size:[1..10] && ' + 'iiif:=true', ); @@ -147,10 +158,44 @@ describe('buildSearchParams', () => { ).toBe('title_sort_nl:asc,status_rank:asc'); }); + it('pins page to 1 for a facet-only (limit:0) query instead of dividing by zero', () => { + const params = buildSearchParams({ ...base, limit: 0 }, schema); + expect(params.per_page).toBe(0); + expect(params.page).toBe(1); + }); + it('requests facets by their logical field name', () => { expect( buildSearchParams({ ...base, facets: ['keyword', 'format'] }, schema) .facet_by, ).toBe('keyword,format'); }); + + it('facets a range field into its declared half-open bins, open ends blank', () => { + // Typesense range syntax is start-inclusive/end-exclusive, so the declared + // `[min, max)` bounds pass straight through; the open-ended bin leaves the + // upper bound blank. 
+ expect( + buildSearchParams({ ...base, facets: ['size'] }, schema).facet_by, + ).toBe('size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('mixes range and plain facets in one facet_by clause', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword', 'size'] }, schema) + .facet_by, + ).toBe('keyword,size(0:[1, 10], 1:[10, 100], 2:[100, ])'); + }); + + it('omits max_facet_values by default but sets it when configured', () => { + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema) + .max_facet_values, + ).toBeUndefined(); + expect( + buildSearchParams({ ...base, facets: ['keyword'] }, schema, { + maxFacetValues: 250, + }).max_facet_values, + ).toBe(250); + }); }); diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index a09c9579..9184cdbe 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -16,10 +16,10 @@ export default mergeConfig( // rethrow guards and best-effort cleanup paths are deliberately not // exercised, which is why branch coverage is lower. thresholds: { - functions: 87.5, - lines: 84.7, - branches: 66.66, - statements: 84.88, + functions: 97.14, + lines: 93.28, + branches: 83.75, + statements: 93.37, }, }, }, diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index 59284d7f..bcf61657 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -4,8 +4,9 @@ import type { SearchSchema } from './schema.js'; /** * The engine port — the boundary a concrete engine adapter (e.g. * `@lde/search-typesense`’s `TypesenseSearchEngine`) implements. 
The adapter - * owns every engine specific (companion-field expansion, `query_by`/weights, the - * filter compiler, `sort_by`, folding, `facet_by`) and returns only logical + * owns every engine specific (companion-field expansion, full-text field + * selection and weights, filter compilation, sorting, result folding, faceting) + * and returns only logical * documents, so a deployment can swap engines without any consumer noticing. * Nothing engine-specific and nothing RDF-specific leaks past this port. * @@ -137,4 +138,12 @@ export interface FacetBucket { readonly value: string; readonly count: number; readonly label?: LocalizedValue; + /** + * For a range-facet bucket: its half-open bounds (`min` inclusive, `max` + * exclusive), echoing the declared {@link FacetRange} so the bucket is + * self-describing and a consumer never hardcodes the bin formula. Both absent + * for a value facet; either absent for an open-ended bin. + */ + readonly min?: number; + readonly max?: number; } diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index cb02290e..37bc4db3 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -21,6 +21,7 @@ export type { SearchSchema, Derivation, PhysicalFields, + FacetRange, } from './schema.js'; // Engine- and protocol-neutral query IR + filter semantics. diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 2873d99c..41ed5356 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -30,11 +30,10 @@ export type FieldKind = * are independent opt-ins: a field exposes exactly the roles it declares. A * field with no `path` is a **derived field** — populated by a * {@link Derivation} rather than projected from the IR — yet it still carries - * full query/schema/output behavior (e.g. `status`, the `*_group` companions, - * the compatibility booleans). + * full query/schema/output behavior (e.g. `status`, the compatibility booleans). 
* * The physical field names a declaration fans out to (per-locale search/sort - * keys, the grouped-facet companion, …) follow one convention, owned by + * keys) follow one convention, owned by * {@link physicalFields} so projection, collection-schema and query compiler * cannot disagree. */ @@ -74,13 +73,32 @@ export interface SearchField { }; /** Projection-time value transform (e.g. strip a media-type prefix). */ readonly transform?: (value: string) => string; - /** Grouped-facet companion (a coarse `${name}_group`; deployment delta). */ - readonly group?: { readonly name: string; readonly prefix: string }; + /** + * Range-facet bins for a numeric (`integer`/`number`/`date`) facetable field. + * When set, the field facets into these fixed half-open `[min, max)` ranges (a + * histogram) rather than one bucket per distinct value — the per-bucket counts + * a UI slider needs. Bins are query-time only (no index impact) and + * engine-neutral: the Typesense adapter emits a `facet_by` range, an + * Elasticsearch adapter a `range` aggregation. See {@link FacetRange}. + */ + readonly facetRanges?: readonly FacetRange[]; +} + +/** + * One half-open `[min, max)` range-facet bin: `min` inclusive, `max` exclusive, + * so contiguous bins partition cleanly with no boundary double-counting. Omit + * `min` (or `max`) for an open-ended bin (`< max`, resp. `≥ min`). `key` is the + * bucket’s stable label, echoed back as the {@link FacetBucket} `value`. + */ +export interface FacetRange { + readonly key: string; + readonly min?: number; + readonly max?: number; } /** * A computed field that is not a direct projection of a single path — a status - * rank, a `*_group` derived from a code table, a compatibility boolean. Reads + * rank, a compatibility boolean. Reads * the framed node and writes onto the flat document the field specs already * populated. 
*/ @@ -116,8 +134,6 @@ export interface PhysicalFields { /** Per-locale folded sort keys `${name}_sort_${locale}` (localized text, * `sortable`); a non-localized field sorts on its `value`. */ readonly sort: readonly string[]; - /** The grouped-facet companion `${name}_group`, when `group` is declared. */ - readonly group?: string; } /** @@ -179,6 +195,5 @@ export function physicalFields(field: SearchField): PhysicalFields { localized && field.sortable ? locales.map((locale) => `${field.name}_sort_${locale}`) : [], - group: field.group ? `${field.name}_group` : undefined, }; } diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts index bd52d449..08ab0fd5 100644 --- a/packages/search/test/schema.test.ts +++ b/packages/search/test/schema.test.ts @@ -101,27 +101,6 @@ describe('physicalFields', () => { }); }); - it('adds the `${name}_group` companion when a field declares a group', () => { - const format: SearchField = { - name: 'format', - kind: 'keyword', - array: true, - facetable: true, - group: { - name: 'format_group', - prefix: 'https://www.iana.org/assignments/media-types/', - }, - }; - - expect(physicalFields(format)).toEqual({ - value: 'format', - display: [], - search: [], - sort: [], - group: 'format_group', - }); - }); - it('emits only the search keys for a search-only localized field (no display, no sort)', () => { const creator: SearchField = { name: 'creator', diff --git a/packages/search/vite.config.ts b/packages/search/vite.config.ts index 6a8321a2..915a945a 100644 --- a/packages/search/vite.config.ts +++ b/packages/search/vite.config.ts @@ -11,9 +11,9 @@ export default mergeConfig( coverage: { thresholds: { functions: 100, - lines: 97.3, - branches: 88.76, - statements: 97.3, + lines: 97.84, + branches: 90.9, + statements: 97.91, }, }, }, From 4cb1e384bd5b55e1f99d281dd03377042e1705e2 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 12:13:55 +0200 Subject: [PATCH 08/13] build(deps): add 
@lde/search-api-graphql to the lockfile and refresh @lde/* pins npm ci failed because the lockfile lacked the new @lde/search-api-graphql workspace. Regenerating against npmjs adds it and brings ~24 @lde/* internal deps up to their latest in-range patches; no third-party or duplicate-version changes. --- package-lock.json | 139 +++++++++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 62 deletions(-) diff --git a/package-lock.json b/package-lock.json index e1a4b8d8..6abec10e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24953,6 +24953,10 @@ "resolved": "packages/search", "link": true }, + "node_modules/@lde/search-api-graphql": { + "resolved": "packages/search-api-graphql", + "link": true + }, "node_modules/@lde/search-typesense": { "resolved": "packages/search-typesense", "link": true @@ -33133,7 +33137,6 @@ "version": "15.10.2", "resolved": "https://registry.npmjs.org/graphql/-/graphql-15.10.2.tgz", "integrity": "sha512-1PRqdDPAmViWr4h1GVBT8RoPZfWSGZa7kDzleTilOfVIslsgf+cia3Nl95v1KDmR4iERPaT7WzQ+tN4MJmbg3w==", - "dev": true, "license": "MIT", "engines": { "node": ">= 10.x" @@ -41008,7 +41011,7 @@ }, "packages/dataset": { "name": "@lde/dataset", - "version": "0.7.7", + "version": "0.7.8", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -41016,10 +41019,10 @@ }, "packages/dataset-registry-client": { "name": "@lde/dataset-registry-client", - "version": "0.8.4", + "version": "0.8.5", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", + "@lde/dataset": "^0.7.8", "@traqula/generator-sparql-1-1": "^1.1.6", "@traqula/parser-sparql-1-1": "^1.1.5", "@traqula/rules-sparql-1-1": "^1.1.0", @@ -41032,34 +41035,34 @@ }, "packages/distribution-downloader": { "name": "@lde/distribution-downloader", - "version": "0.6.5", + "version": "0.6.6", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", + "@lde/dataset": "^0.7.8", "filenamify-url": "4.0.0", "tslib": "^2.3.0" } }, "packages/distribution-health": { "name": 
"@lde/distribution-health", - "version": "0.2.1", + "version": "0.2.3", "license": "MIT", "dependencies": { - "@lde/distribution-probe": "0.2.1", - "@lde/sparql-importer": "0.6.5", + "@lde/distribution-probe": "^0.2.2", + "@lde/sparql-importer": "^0.6.5", "tslib": "^2.3.0" }, "devDependencies": { - "@lde/dataset": "0.7.7" + "@lde/dataset": "^0.7.7" } }, "packages/distribution-monitor": { "name": "@lde/distribution-monitor", - "version": "0.2.1", + "version": "0.2.3", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/distribution-probe": "0.2.1", + "@lde/dataset": "^0.7.8", + "@lde/distribution-probe": "^0.2.2", "c12": "^3.3.4", "commander": "^15.0.0", "cron": "^4.1.0", @@ -41086,10 +41089,10 @@ }, "packages/distribution-probe": { "name": "@lde/distribution-probe", - "version": "0.2.1", + "version": "0.2.3", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", + "@lde/dataset": "^0.7.8", "rdf-parse": "^5.0.0", "tslib": "^2.3.0" } @@ -41756,7 +41759,7 @@ }, "packages/docgen": { "name": "@lde/docgen", - "version": "0.6.18", + "version": "0.6.19", "license": "MIT", "dependencies": { "@tpluscode/rdf-ns-builders": "^5.0.0", @@ -41786,7 +41789,7 @@ }, "packages/fastify-rdf": { "name": "@lde/fastify-rdf", - "version": "0.4.6", + "version": "0.4.7", "license": "MIT", "dependencies": { "@fastify/accepts": "^5.0.0", @@ -42483,7 +42486,7 @@ }, "packages/iiif-validator": { "name": "@lde/iiif-validator", - "version": "0.1.4", + "version": "0.1.5", "license": "MIT", "dependencies": { "@iiif/parser": "^2.2.10", @@ -42492,7 +42495,7 @@ }, "packages/local-sparql-endpoint": { "name": "@lde/local-sparql-endpoint", - "version": "0.2.13", + "version": "0.2.14", "license": "MIT", "dependencies": { "jest-dev-server": "11.0.0", @@ -42505,15 +42508,15 @@ }, "packages/pipeline": { "name": "@lde/pipeline", - "version": "0.31.3", + "version": "0.31.5", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/dataset-registry-client": "0.8.4", - 
"@lde/distribution-health": "0.2.1", - "@lde/distribution-probe": "0.2.1", - "@lde/sparql-importer": "0.6.5", - "@lde/sparql-server": "0.4.11", + "@lde/dataset": "^0.7.7", + "@lde/dataset-registry-client": "^0.8.4", + "@lde/distribution-health": "^0.2.2", + "@lde/distribution-probe": "^0.2.2", + "@lde/sparql-importer": "^0.6.5", + "@lde/sparql-server": "^0.4.11", "@rdfjs/namespace": "^2.0.1", "@rdfjs/types": "^2.0.1", "@tpluscode/rdf-ns-builders": "^5.0.0", @@ -42531,7 +42534,7 @@ }, "packages/pipeline-console-reporter": { "name": "@lde/pipeline-console-reporter", - "version": "0.22.3", + "version": "0.22.5", "license": "MIT", "dependencies": { "chalk": "^5.4.1", @@ -42541,8 +42544,8 @@ "tslib": "^2.3.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-console-reporter/node_modules/ansi-regex": { @@ -42722,7 +42725,7 @@ }, "packages/pipeline-shacl-sampler": { "name": "@lde/pipeline-shacl-sampler", - "version": "0.5.3", + "version": "0.5.5", "license": "MIT", "dependencies": { "@rdfjs/types": "^2.0.1", @@ -42732,8 +42735,8 @@ "tslib": "^2.3.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-shacl-sampler/node_modules/n3": { @@ -42751,7 +42754,7 @@ }, "packages/pipeline-shacl-validator": { "name": "@lde/pipeline-shacl-validator", - "version": "0.13.3", + "version": "0.13.5", "license": "MIT", "dependencies": { "@rdfjs/types": "^2.0.1", @@ -42764,8 +42767,8 @@ "n3": "^2.1.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-shacl-validator/node_modules/n3": { @@ -42784,7 +42787,7 @@ }, "packages/pipeline-void": { "name": "@lde/pipeline-void", - "version": "0.29.3", + "version": "0.29.5", "license": "MIT", "dependencies": { "@rdfjs/types": 
"^2.0.1", @@ -42794,8 +42797,8 @@ "tslib": "^2.3.0" }, "peerDependencies": { - "@lde/dataset": "0.7.7", - "@lde/pipeline": "0.31.3" + "@lde/dataset": "^0.7.8", + "@lde/pipeline": "^0.31.4" } }, "packages/pipeline-void/node_modules/n3": { @@ -42852,10 +42855,10 @@ }, "packages/search": { "name": "@lde/search", - "version": "0.1.1", + "version": "0.1.2", "license": "MIT", "dependencies": { - "@lde/text-normalization": "0.1.0", + "@lde/text-normalization": "^0.1.1", "@rdfjs/types": "^2.0.1", "@tpluscode/rdf-ns-builders": "^5.0.0", "jsonld": "^9.0.0", @@ -42866,11 +42869,23 @@ "n3": "^2.1.0" } }, + "packages/search-api-graphql": { + "name": "@lde/search-api-graphql", + "version": "0.1.0", + "license": "MIT", + "dependencies": { + "@lde/search": "^0.1.2", + "graphql": "^15.8.0", + "tslib": "^2.3.0" + } + }, "packages/search-typesense": { "name": "@lde/search-typesense", - "version": "0.1.0", + "version": "0.1.1", "license": "MIT", "dependencies": { + "@lde/search": "^0.1.2", + "@lde/text-normalization": "^0.1.1", "tslib": "^2.3.0", "typesense": "^3.0.6" }, @@ -42894,28 +42909,28 @@ }, "packages/sparql-importer": { "name": "@lde/sparql-importer", - "version": "0.6.5", + "version": "0.6.6", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/distribution-downloader": "0.6.5", - "@lde/task-runner": "0.2.11", + "@lde/dataset": "^0.7.8", + "@lde/distribution-downloader": "^0.6.5", + "@lde/task-runner": "^0.2.11", "tslib": "^2.3.0" } }, "packages/sparql-qlever": { "name": "@lde/sparql-qlever", - "version": "0.14.10", + "version": "0.14.11", "license": "MIT", "dependencies": { - "@lde/dataset": "0.7.7", - "@lde/distribution-downloader": "0.6.5", - "@lde/sparql-importer": "0.6.5", - "@lde/sparql-server": "0.4.11", - "@lde/task-runner": "0.2.11", - "@lde/task-runner-docker": "0.2.13", - "@lde/task-runner-native": "0.2.14", - "@lde/wait-for-sparql": "0.2.13", + "@lde/dataset": "^0.7.8", + "@lde/distribution-downloader": "^0.6.5", + "@lde/sparql-importer": 
"^0.6.5", + "@lde/sparql-server": "^0.4.11", + "@lde/task-runner": "^0.2.11", + "@lde/task-runner-docker": "^0.2.13", + "@lde/task-runner-native": "^0.2.14", + "@lde/wait-for-sparql": "^0.2.13", "rdf-parse": "^5.0.0", "rdf-serialize": "^5.1.0", "tslib": "^2.3.0", @@ -43620,7 +43635,7 @@ }, "packages/sparql-server": { "name": "@lde/sparql-server", - "version": "0.4.11", + "version": "0.4.12", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -43628,7 +43643,7 @@ }, "packages/task-runner": { "name": "@lde/task-runner", - "version": "0.2.11", + "version": "0.2.12", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -43636,10 +43651,10 @@ }, "packages/task-runner-docker": { "name": "@lde/task-runner-docker", - "version": "0.2.13", + "version": "0.2.14", "license": "MIT", "dependencies": { - "@lde/task-runner": "0.2.11", + "@lde/task-runner": "^0.2.12", "dockerode": "^5.0.1", "tslib": "^2.3.0" }, @@ -43649,16 +43664,16 @@ }, "packages/task-runner-native": { "name": "@lde/task-runner-native", - "version": "0.2.14", + "version": "0.2.15", "license": "MIT", "dependencies": { - "@lde/task-runner": "0.2.11", + "@lde/task-runner": "^0.2.12", "tslib": "^2.3.0" } }, "packages/text-normalization": { "name": "@lde/text-normalization", - "version": "0.1.0", + "version": "0.1.1", "license": "MIT", "dependencies": { "tslib": "^2.3.0" @@ -43666,7 +43681,7 @@ }, "packages/wait-for-sparql": { "name": "@lde/wait-for-sparql", - "version": "0.2.13", + "version": "0.2.14", "license": "MIT", "dependencies": { "fetch-sparql-endpoint": "^7.1.1", From c1f19ad7e40c5d2944a1d0a22b1dc1de5ceae2be Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 12:18:10 +0200 Subject: [PATCH 09/13] fix(search-typesense): narrow possibly-undefined facet buckets in the search-engine test `result.facets` is a `Partial` record, so a facet is `FacetBucket[] | undefined`; guard the two spreads with `?? []` so the `typecheck` target passes (it never ran in CI before the lockfile fix). 
--- packages/search-typesense/test/search-engine.test.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts index 3a392f8a..8847d2d7 100644 --- a/packages/search-typesense/test/search-engine.test.ts +++ b/packages/search-typesense/test/search-engine.test.ts @@ -198,7 +198,7 @@ describe('createTypesenseSearchEngine (integration)', () => { ); // Plain facet: value + count, no label. - const keyword = [...result.facets.keyword].sort( + const keyword = [...(result.facets.keyword ?? [])].sort( (a, b) => b.count - a.count, ); expect(keyword).toEqual([ @@ -207,7 +207,7 @@ describe('createTypesenseSearchEngine (integration)', () => { ]); // Reference facet: IRI-keyed buckets carry the resolved data label. - const publisher = [...result.facets.publisher].sort( + const publisher = [...(result.facets.publisher ?? [])].sort( (a, b) => b.count - a.count, ); expect(publisher).toEqual([ From 5b9e68596b4956a49302c74d83c9aba0d27d035a Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 1 Jul 2026 15:55:10 +0200 Subject: [PATCH 10/13] docs(search): state ADR 3 design directly, without dated update annotations Fold the unified-field-model blockquote and the dated Consequences bullet into running text, so the ADR reads as the current design rather than a change log. --- .../decisions/0003-search-api-core-query-model.md | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 38f9e697..09e08a4a 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -49,13 +49,11 @@ NodeShape + its `search:` annotations. 
**One `SearchField` declaration drives fo consumers** – projection (RDF→flat document), the engine collection schema, the query semantics, and the GraphQL surface – so they cannot drift. -> Updated 2026-06-26 (during implementation): this is the **unified** field model. It -> folds the three previously separate declarations into one – the projection-side -> `FieldSpec`/`FieldKind` (RDF→doc), the deployment’s Typesense `SEARCH_FIELDS` (collection -> schema + weights), and the query model below. The original ADR deferred this unification; -> it is now adopted (option “c”). The `kind` + capability flags replace the old discriminated -> projection kinds, derived fields become first-class, and the Typesense-vocabulary types are -> _derived_ from `kind` rather than re-declared. +It is a **unified** model: one declaration in place of three otherwise-separate ones – the +projection-side `FieldSpec`/`FieldKind`, the Typesense `SEARCH_FIELDS` (collection schema + +weights), and the query model below. `kind` plus capability flags replace the discriminated +projection kinds, derived fields are first-class, and the Typesense-vocabulary types are +_derived_ from `kind` rather than re-declared. ```ts type FieldKind = @@ -318,9 +316,6 @@ not enabled for DR v1, more relevant for B/C. - Carried through: the Stable API Contract discipline, the reference `strategy` concept, the surface `LanguageString` list, folding at the adapter boundary + query side via `@lde/text-normalization`, SDL-in-projection vs filter-compiler-in-adapter. -- Adopted during implementation (2026-06-26): the **unified** field model – the projection - `FieldSpec` (RDF→doc) and the deployment’s Typesense `SEARCH_FIELDS` are folded into this - one `SearchField` (see the Field model note above). - Deferred: REST surface; framed-JSON-LD materialised view (nested storage, index-time label inlining, detail-page-on-index, terms-collection split); semantic/hybrid (vector) search. 
From 90a0970f8ff20caa10f4c2c84407c6bb10454cbf Mon Sep 17 00:00:00 2001 From: David de Boer Date: Thu, 2 Jul 2026 19:46:42 +0200 Subject: [PATCH 11/13] feat(search)!: rename the per-type SearchSchema to SearchType - SearchType is one root type declaration (one SHACL NodeShape, one GraphQL object type); SearchSchema now names the whole search declaration: a ReadonlyMap of SearchTypes keyed by type IRI, built with the new searchSchema() factory - projectGraph now consumes a SearchSchema instead of a SearchType array - rename buildSearchSchema / printSearchSchema / BuildSearchSchemaOptions to buildGraphQLSchema / printGraphQLSchema / BuildGraphQLSchemaOptions: they construct a GraphQLSchema rather than the SearchSchema the old names implied - rename schema parameters to searchType where they take one type, and the FacetFieldsOf/OutputFieldsOf/EngineFor/ResultFor generic from Schema to Type - add a Terminology section to the @lde/search README mapping SearchField / SearchType / SearchSchema onto SHACL and GraphQL; update ADRs 3 and 4, the package READMEs and npm descriptions - drop section-divider comments in build-schema.ts and stale grouped-facet mentions in the READMEs BREAKING CHANGE: the per-type interface SearchSchema is renamed to SearchType, and SearchSchema now denotes the type-keyed map built with searchSchema(). projectGraph(quads, types[]) becomes projectGraph(quads, searchSchema(...types)). In @lde/search-api-graphql, buildSearchSchema, printSearchSchema and BuildSearchSchemaOptions are renamed to buildGraphQLSchema, printGraphQLSchema and BuildGraphQLSchemaOptions. 
--- .../0003-search-api-core-query-model.md | 10 ++-- .../0004-search-api-graphql-surface.md | 28 +++++----- packages/search-api-graphql/README.md | 23 ++++---- packages/search-api-graphql/package.json | 2 +- .../search-api-graphql/src/build-schema.ts | 52 +++++++++---------- packages/search-api-graphql/src/index.ts | 4 +- .../test/build-schema.test.ts | 20 ++++--- .../test/generator-stability.test.ts | 10 ++-- packages/search-typesense/README.md | 6 +-- .../search-typesense/src/collection-schema.ts | 12 ++--- .../search-typesense/src/query-compiler.ts | 38 ++++++++------ packages/search-typesense/src/search.ts | 30 +++++------ .../test/collection-schema.test.ts | 4 +- .../test/generator-stability.test.ts | 4 +- .../test/parse-response.test.ts | 6 +-- .../test/query-compiler.test.ts | 4 +- .../test/search-engine.test.ts | 4 +- packages/search/README.md | 34 +++++++++--- packages/search/package.json | 2 +- packages/search/src/engine.ts | 46 ++++++++-------- packages/search/src/index.ts | 6 ++- packages/search/src/project.ts | 30 +++++------ packages/search/src/schema.ts | 38 ++++++++++---- packages/search/test/engine.test.ts | 6 +-- packages/search/test/project.test.ts | 18 ++++--- packages/search/test/schema.test.ts | 4 +- 26 files changed, 247 insertions(+), 194 deletions(-) diff --git a/docs/decisions/0003-search-api-core-query-model.md b/docs/decisions/0003-search-api-core-query-model.md index 09e08a4a..df74737c 100644 --- a/docs/decisions/0003-search-api-core-query-model.md +++ b/docs/decisions/0003-search-api-core-query-model.md @@ -85,7 +85,9 @@ interface SearchField { type Derivation = (document: SearchDocument, node: FramedNode) => void; -interface SearchSchema { +// One root type (one SHACL NodeShape); a whole deployment’s declaration is the +// SearchSchema, a map of SearchTypes keyed by type IRI (built with searchSchema()). 
+interface SearchType { readonly type: string; // sh:targetClass readonly fields: readonly SearchField[]; readonly derivations?: readonly Derivation[]; // computed fields: status, booleans @@ -194,15 +196,15 @@ SearchEngine` readable. ```ts // FacetField / OutputField default to `string` (ergonomic) and a deployment narrows them -// to its schema’s facetable / output field names for typo-safe facet and document access -// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). +// to its type’s facetable / output field names for typo-safe facet and document access +// (helpers FacetFieldsOf / OutputFieldsOf, or the EngineFor alias). interface SearchEngine< FacetField extends string = string, OutputField extends string = string, > { search( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, ): Promise>; } diff --git a/docs/decisions/0004-search-api-graphql-surface.md b/docs/decisions/0004-search-api-graphql-surface.md index c5b297da..678d6d04 100644 --- a/docs/decisions/0004-search-api-graphql-surface.md +++ b/docs/decisions/0004-search-api-graphql-surface.md @@ -21,23 +21,23 @@ separate package). ### Runtime configuration, not code generation The surface is **constructed at runtime from the field-model configuration** -(`buildSearchSchema(config)`), once at startup, with generic resolvers shipped in the package +(`buildGraphQLSchema(config)`), once at startup, with generic resolvers shipped in the package attached to that schema – nothing is emitted or committed. The resolvers are inherently generic (one root resolver maps args to a `SearchQuery`, calls the engine, and maps the result back; the field model only parameterises data), so codegen would emit N near-identical stubs that all delegate to the same logic, plus a build step and staleness risk, for no benefit. A live GraphQL API serves its own schema via introspection, so clients need no committed -`.graphql` file; the field-model diff is the reviewable change. 
`printSearchSchema()` exists +`.graphql` file; the field-model diff is the reviewable change. `printGraphQLSchema()` exists only as an **optional** CI snapshot test guarding the frozen contract against accidental breaking changes – not a shipped artifact. ### The schema-building function ```ts -// Generic over the config *value’s* type (capture it `as const satisfies SearchSchema`), so +// Generic over the config *value’s* type (capture it `as const satisfies SearchType`), so // one declaration drives both the runtime schema and the static TS types below. -function buildSearchSchema( +function buildGraphQLSchema( schema: S, options: { typeName: string; // 'Dataset' – drives all derived type names @@ -54,10 +54,10 @@ function buildSearchSchema( // Static types derived from the SAME config value’s type (compile-time only, erased at // runtime); one source, no codegen, no drift. Exported for typed in-process callers/tests. -type OutputOf; // { id: string; title: LanguageString[]; size: number | null; … } -type WhereOf; // { format?: StringFilter; size?: FloatRange; … } -type OrderByOf; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } -type FacetOf; // the facetable-field-name union +type OutputOf; // { id: string; title: LanguageString[]; size: number | null; … } +type WhereOf; // { format?: StringFilter; size?: FloatRange; … } +type OrderByOf; // { field: 'RELEVANCE' | 'TITLE' | …; direction: 'ASC' | 'DESC' } +type FacetOf; // the facetable-field-name union // also exported for manual composition / non-default servers: function buildSearchTypeDefsAndResolvers( @@ -65,17 +65,17 @@ function buildSearchTypeDefsAndResolvers( options, ): { typeDefs: string; resolvers: object }; // optional CI helper only: -function printSearchSchema(schema, options): string; // SDL, for a snapshot/breaking-change test +function printGraphQLSchema(schema, options): string; // SDL, for a snapshot/breaking-change test ``` -`buildSearchSchema` is the standalone, 
framework-agnostic artifact (depends only on +`buildGraphQLSchema` is the standalone, framework-agnostic artifact (depends only on `graphql` + `@graphql-tools/schema`). Deep customisation passes `extendTypeDefs`/ `extendResolvers` (merged before `makeExecutableSchema`, since Mercurius registers once) or composes the exported typeDefs/resolvers by hand. ### A typed surface the contract does not depend on -One `as const satisfies SearchSchema` declaration drives two **independent** projections: the +One `as const satisfies SearchType` declaration drives two **independent** projections: the **runtime contract** (the `GraphQLSchema`, built at startup by reading the value – `field.kind`, `output`, `facetable`, …) and a **static TS mirror** (`OutputOf` / `WhereOf` / `OrderByOf` / `FacetOf`, computed from `typeof schema` via mapped types). @@ -83,8 +83,8 @@ One `as const satisfies SearchSchema` declaration drives two **independent** pro The contract **does not depend on the TS types.** `as const`/`satisfies` are compile-time only and erased, so the served schema is byte-identical whether or not the mirror exists – it is a developer-experience overlay. The two derivations can drift (the runtime kind→GraphQL-type -mapping lives in `buildSearchSchema`; the type-level mapping in `OutputOf` duplicates it), -so the **contract** is guarded by the optional `printSearchSchema()` SDL snapshot (the real +mapping lives in `buildGraphQLSchema`; the type-level mapping in `OutputOf` duplicates it), +so the **contract** is guarded by the optional `printGraphQLSchema()` SDL snapshot (the real artifact), while the TS mirror only catches our own coding mistakes against it. Values are typed at both ends, with the resolver as the typed transform between them: @@ -294,7 +294,7 @@ untagged (`und`) last – so `[0]` is always the best available value. 
- **Hot path is the engine, not GraphQL.** Per-request cost is dominated by the Typesense round-trip; parse/validate/resolve of a small query is sub-millisecond. - **Introspection serves the contract** (cheap, client-cached). Leave it on, or disable in - production and use `printSearchSchema` for tooling. + production and use `printGraphQLSchema` for tooling. ### Context contract diff --git a/packages/search-api-graphql/README.md b/packages/search-api-graphql/README.md index 88f8cdb3..d6274a9d 100644 --- a/packages/search-api-graphql/README.md +++ b/packages/search-api-graphql/README.md @@ -1,7 +1,7 @@ # @lde/search-api-graphql The GraphQL surface for the [`@lde/search`](../search) core. **Both engine- and -domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchSchema` +domain-agnostic:** it builds an executable `GraphQLSchema` from any `SearchType` at runtime, and serves it with one generic resolver over any `SearchEngine`. It names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, `CreativeWork`, …) nor your **engine** (the resolver calls `context.engine`, be it @@ -9,16 +9,16 @@ names neither your **domain** (you pass `typeName` — `Dataset`, `Person`, ## Runtime configuration, not codegen -`buildSearchSchema(schema, { typeName })` constructs the schema once at startup -from the field model — no SDL artifact, no generated resolver stubs. The field -model is the single source; the GraphQL contract is whatever it produces. Output -types, the `where`/`orderBy`/facet inputs, reference types and nullability are all -derived from each field’s `kind` and capability flags. +`buildGraphQLSchema(searchType, { typeName })` constructs the schema once at +startup from the field model — no SDL artifact, no generated resolver stubs. The +field model is the single source; the GraphQL contract is whatever it produces. 
+Output types, the `where`/`orderBy`/facet inputs, reference types and nullability +are all derived from each field’s `kind` and capability flags. ```ts -import { buildSearchSchema } from '@lde/search-api-graphql'; +import { buildGraphQLSchema } from '@lde/search-api-graphql'; -const gqlSchema = buildSearchSchema(DATASET, { +const gqlSchema = buildGraphQLSchema(DATASET, { typeName: 'Dataset', queryDefaults: (query) => ({ ...query, @@ -50,6 +50,7 @@ The surface reads the same field model the index is built from, and compiles int the same neutral `SearchQuery` the engine consumes — so the API, the index and a future REST surface stay in lockstep. The contract is **frozen** (breaking to change), and because it is generated rather than hand-written, a _consumer_ guards -it with a `printSearchSchema(schema, options)` SDL snapshot over its **own** -schema and `typeName` — that snapshot also catches a `buildSearchSchema` change in -a future version of this library silently altering the consumer’s contract. +it with a `printGraphQLSchema(searchType, options)` SDL snapshot over its **own** +search type and `typeName` — that snapshot also catches a `buildGraphQLSchema` +change in a future version of this library silently altering the consumer’s +contract. diff --git a/packages/search-api-graphql/package.json b/packages/search-api-graphql/package.json index ea761b48..70f76450 100644 --- a/packages/search-api-graphql/package.json +++ b/packages/search-api-graphql/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search-api-graphql", "version": "0.1.0", - "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchSchema at runtime (no codegen), served by one generic resolver over any SearchEngine. 
You supply the schema and typeName; it names neither your domain nor your engine.", + "description": "Engine- and domain-agnostic GraphQL surface for @lde/search: builds an executable GraphQLSchema from any SearchType at runtime (no codegen), served by one generic resolver over any SearchEngine. You supply the search type and typeName; it names neither your domain nor your engine.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search-api-graphql" diff --git a/packages/search-api-graphql/src/build-schema.ts b/packages/search-api-graphql/src/build-schema.ts index f7449793..836cc2a6 100644 --- a/packages/search-api-graphql/src/build-schema.ts +++ b/packages/search-api-graphql/src/build-schema.ts @@ -28,7 +28,7 @@ import { type SearchEngine, type SearchField, type SearchQuery, - type SearchSchema, + type SearchType, type Sort, } from '@lde/search'; import { @@ -50,7 +50,7 @@ export interface SearchContext { readonly onFacetError?: (field: string, error: unknown) => void; } -export interface BuildSearchSchemaOptions { +export interface BuildGraphQLSchemaOptions { /** Drives all derived type names, e.g. `Dataset`. */ readonly typeName: string; /** Root query field; defaults to the lowercased plural of `typeName`. */ @@ -86,9 +86,9 @@ function screamingSnake(name: string): string { * arguments to a {@link SearchQuery}, calls `context.engine`, and maps the result * back; the field model only parameterises data. */ -export function buildSearchSchema( - schema: SearchSchema, - options: BuildSearchSchemaOptions, +export function buildGraphQLSchema( + searchType: SearchType, + options: BuildGraphQLSchemaOptions, ): GraphQLSchema { const { typeName } = options; const languageOrder = options.languageOrder ?? defaultLanguageOrder; @@ -96,7 +96,6 @@ export function buildSearchSchema( options.queryField ?? 
`${typeName.charAt(0).toLowerCase()}${typeName.slice(1)}s`; - // --- Shared types --- const languageString = new GraphQLObjectType({ name: 'LanguageString', fields: { @@ -165,9 +164,9 @@ export function buildSearchSchema( }, }); - // --- Reference types, one per referenced shape, reused by every field. --- + // One reference type per referenced shape, reused by every field. const referenceTypes = new Map(); - for (const field of outputFields(schema)) { + for (const field of outputFields(searchType)) { if ( field.kind === 'reference' && field.ref && @@ -189,7 +188,6 @@ export function buildSearchSchema( } } - // --- Output type --- const outputType = new GraphQLObjectType({ name: typeName, fields: () => { @@ -199,7 +197,7 @@ export function buildSearchSchema( > = { id: { type: new GraphQLNonNull(GraphQLString) }, }; - for (const field of outputFields(schema)) { + for (const field of outputFields(searchType)) { fields[field.name] = outputFieldConfig(field); } return fields; @@ -263,12 +261,11 @@ export function buildSearchSchema( } } - // --- where / orderBy / facets --- const whereInput = new GraphQLInputObjectType({ name: `${typeName}Where`, fields: () => { const fields: Record = {}; - for (const field of filterableFields(schema)) { + for (const field of filterableFields(searchType)) { fields[field.name] = { type: whereFieldType(field) }; } return fields; @@ -293,7 +290,7 @@ export function buildSearchSchema( const sortValues: GraphQLEnumValueConfigMap = { RELEVANCE: { value: 'relevance' }, }; - for (const field of sortableFields(schema)) { + for (const field of sortableFields(searchType)) { sortValues[screamingSnake(field.name)] = { value: field.name }; } const sortField = new GraphQLEnumType({ @@ -323,7 +320,7 @@ export function buildSearchSchema( string, GraphQLFieldConfig > = {}; - for (const field of facetableFields(schema)) { + for (const field of facetableFields(searchType)) { const isRange = field.facetRanges !== undefined && field.facetRanges.length > 0; 
fields[field.name] = { @@ -350,7 +347,10 @@ export function buildSearchSchema( // rather than failing the whole query (which would null the non-null // result and discard the items + every other facet). try { - const result = await context.engine.search(facetQuery, schema); + const result = await context.engine.search( + facetQuery, + searchType, + ); return result.facets[field.name] ?? []; } catch (error) { context.onFacetError?.(field.name, error); @@ -392,14 +392,14 @@ export function buildSearchSchema( perPage: { type: GraphQLInt, defaultValue: 20 }, }, resolve: async (_source, args, context: SearchContext) => { - const built = argsToQuery(args as QueryArgs, context, schema); + const built = argsToQuery(args as QueryArgs, context, searchType); const finalQuery = options.queryDefaults ? options.queryDefaults(built, context) : built; // Items + total only; facets are resolved lazily per selected key. const result = await context.engine.search( { ...finalQuery, facets: [] }, - schema, + searchType, ); return { items: result.hits.map((hit) => ({ id: hit.id, ...hit.document })), @@ -425,14 +425,14 @@ export function buildSearchSchema( /** * The SDL of the built schema. Not a shipped artifact — a consumer uses it for an * optional CI snapshot test over its own schema, catching accidental breaking - * changes to its frozen contract (including a `buildSearchSchema` change in a + * changes to its frozen contract (including a `buildGraphQLSchema` change in a * future version of this library silently altering it). 
*/ -export function printSearchSchema( - schema: SearchSchema, - options: BuildSearchSchemaOptions, +export function printGraphQLSchema( + searchType: SearchType, + options: BuildGraphQLSchemaOptions, ): string { - return printSchema(buildSearchSchema(schema, options)); + return printSchema(buildGraphQLSchema(searchType, options)); } interface QueryArgs { @@ -447,13 +447,13 @@ interface QueryArgs { function argsToQuery( args: QueryArgs, context: SearchContext, - schema: SearchSchema, + searchType: SearchType, ): SearchQuery { const perPage = args.perPage ?? 20; const page = args.page ?? 1; return { text: args.query, - where: whereToFilters(args.where, schema), + where: whereToFilters(args.where, searchType), orderBy: args.orderBy ? [{ field: args.orderBy.field, direction: args.orderBy.direction }] : [], @@ -467,13 +467,13 @@ function argsToQuery( function whereToFilters( where: Record | undefined, - schema: SearchSchema, + searchType: SearchType, ): Filter[] { if (where === undefined) { return []; } const filters: Filter[] = []; - for (const field of filterableFields(schema)) { + for (const field of filterableFields(searchType)) { const value = where[field.name]; if (value === undefined || value === null) { continue; diff --git a/packages/search-api-graphql/src/index.ts b/packages/search-api-graphql/src/index.ts index 2fe7db46..20c13223 100644 --- a/packages/search-api-graphql/src/index.ts +++ b/packages/search-api-graphql/src/index.ts @@ -1,7 +1,7 @@ -export { buildSearchSchema, printSearchSchema } from './build-schema.js'; +export { buildGraphQLSchema, printGraphQLSchema } from './build-schema.js'; export type { SearchContext, - BuildSearchSchemaOptions, + BuildGraphQLSchemaOptions, } from './build-schema.js'; export { defaultLanguageOrder, toLanguageStrings } from './language.js'; export type { LanguageString, LanguageOrder } from './language.js'; diff --git a/packages/search-api-graphql/test/build-schema.test.ts 
b/packages/search-api-graphql/test/build-schema.test.ts index 6ba323de..243b0ec9 100644 --- a/packages/search-api-graphql/test/build-schema.test.ts +++ b/packages/search-api-graphql/test/build-schema.test.ts @@ -4,11 +4,11 @@ import type { SearchEngine, SearchQuery, SearchResult, - SearchSchema, + SearchType, } from '@lde/search'; -import { buildSearchSchema, type SearchContext } from '../src/build-schema.js'; +import { buildGraphQLSchema, type SearchContext } from '../src/build-schema.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -125,14 +125,14 @@ async function run( variables?: Record, ) { return graphql({ - schema: buildSearchSchema(schema, { typeName: 'Dataset' }), + schema: buildGraphQLSchema(schema, { typeName: 'Dataset' }), source, contextValue: context, variableValues: variables, }); } -describe('buildSearchSchema', () => { +describe('buildGraphQLSchema', () => { it('resolves a query, mapping the result to the typed output', async () => { const { engine, received } = fakeEngine(canned); const result = await run( @@ -454,7 +454,7 @@ describe('buildSearchSchema', () => { return canned; }, }; - const gqlSchema = buildSearchSchema(schema, { + const gqlSchema = buildGraphQLSchema(schema, { typeName: 'Dataset', queryDefaults: (query) => ({ ...query, @@ -474,7 +474,9 @@ describe('buildSearchSchema', () => { }); it('derives nullability: required scalar non-null, optional scalar nullable, arrays/booleans non-null', () => { - const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + const sdl = printSchema( + buildGraphQLSchema(schema, { typeName: 'Dataset' }), + ); expect(sdl).toMatch(/status: String!/); // required expect(sdl).toMatch(/size: Int\b(?!!)/); // optional → nullable expect(sdl).toMatch(/title: \[LanguageString!\]!/); @@ -484,7 +486,9 @@ describe('buildSearchSchema', () => { }); it('builds the where, orderBy enum and keyed facets object from the field 
model', () => { - const sdl = printSchema(buildSearchSchema(schema, { typeName: 'Dataset' })); + const sdl = printSchema( + buildGraphQLSchema(schema, { typeName: 'Dataset' }), + ); expect(sdl).toMatch(/enum DatasetSortField/); expect(sdl).toMatch(/RELEVANCE/); expect(sdl).toMatch(/SIZE/); diff --git a/packages/search-api-graphql/test/generator-stability.test.ts b/packages/search-api-graphql/test/generator-stability.test.ts index 78a86f40..c78b1535 100644 --- a/packages/search-api-graphql/test/generator-stability.test.ts +++ b/packages/search-api-graphql/test/generator-stability.test.ts @@ -1,15 +1,15 @@ import { describe, expect, it } from 'vitest'; -import type { SearchSchema } from '@lde/search'; -import { printSearchSchema } from '../src/build-schema.js'; +import type { SearchType } from '@lde/search'; +import { printGraphQLSchema } from '../src/build-schema.js'; /** * A neutral fixture exercising every kind + capability — NOT a real domain. Its * SDL is snapshotted purely to pin the **generator**: any change to how - * `buildSearchSchema` maps the field model (nullability, type names, enums, + * `buildGraphQLSchema` maps the field model (nullability, type names, enums, * reference reuse) surfaces as a snapshot diff before this library is published, * so a consumer’s contract can’t shift from under it by accident. 
*/ -const THING: SearchSchema = { +const THING: SearchType = { type: 'https://example.org/Thing', fields: [ { @@ -92,6 +92,6 @@ const THING: SearchSchema = { describe('GraphQL generator stability', () => { it('emits a stable SDL for a representative schema', () => { - expect(printSearchSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); + expect(printGraphQLSchema(THING, { typeName: 'Thing' })).toMatchSnapshot(); }); }); diff --git a/packages/search-typesense/README.md b/packages/search-typesense/README.md index ea681cae..efffc145 100644 --- a/packages/search-typesense/README.md +++ b/packages/search-typesense/README.md @@ -2,7 +2,7 @@ [Typesense](https://typesense.org/) engine adapter for the engine- and domain-agnostic [`@lde/search`](../search) core. **Engine-specific (Typesense) but -domain-agnostic** – you supply a `SearchSchema`; this package never names your +domain-agnostic** – you supply a `SearchType`; this package never names your domain. It is the Typesense implementation of the `SearchEngine` port: it derives a collection schema from the field model, compiles the neutral `SearchQuery` into Typesense search params, runs it, reconstructs the engine-neutral `SearchResult`, @@ -10,10 +10,10 @@ and manages the index lifecycle (blue/green rebuild). ## Collection schema and engine -`buildCollectionSchema(schema, { name, defaultSortingField, … })` derives a +`buildCollectionSchema(searchType, { name, defaultSortingField, … })` derives a Typesense collection from the unified `SearchField` model — the Typesense field type comes from each field’s `kind`, and the physical fanout (per-locale -search/sort keys, the `_group` companion) matches what the projection writes, via +search/sort keys) matches what the projection writes, via `@lde/search`’s `physicalFields`, so the index and the documents cannot drift. 
`createTypesenseSearchEngine(client, { collection, labelsCollection })` is the diff --git a/packages/search-typesense/src/collection-schema.ts b/packages/search-typesense/src/collection-schema.ts index 37f0d378..af133b08 100644 --- a/packages/search-typesense/src/collection-schema.ts +++ b/packages/search-typesense/src/collection-schema.ts @@ -1,10 +1,6 @@ import type { CollectionCreateSchema } from 'typesense'; import type { CollectionFieldSchema } from 'typesense/lib/Typesense/Collection.js'; -import { - physicalFields, - type SearchField, - type SearchSchema, -} from '@lde/search'; +import { physicalFields, type SearchField, type SearchType } from '@lde/search'; /** Deployment-specific options the generic field model does not carry. */ export interface CollectionSchemaOptions { @@ -20,7 +16,7 @@ export interface CollectionSchemaOptions { } /** - * Build a Typesense collection schema from the unified {@link SearchSchema}, so + * Build a Typesense collection schema from the unified {@link SearchType}, so * the index and the projection are driven by one declarative source and cannot * drift. Each field fans out into the same physical fields the projection writes * ({@link physicalFields}); the Typesense field type is derived from the field @@ -31,13 +27,13 @@ export interface CollectionSchemaOptions { * field stems in `defaultLocale`. */ export function buildCollectionSchema( - schema: SearchSchema, + searchType: SearchType, options: CollectionSchemaOptions, ): CollectionCreateSchema { const defaultLocale = options.defaultLocale ?? 
'nl'; const collection: CollectionCreateSchema = { name: options.name, - fields: schema.fields.flatMap((field) => + fields: searchType.fields.flatMap((field) => typesenseFields(field, defaultLocale, options.defaultSortingField), ), }; diff --git a/packages/search-typesense/src/query-compiler.ts b/packages/search-typesense/src/query-compiler.ts index 662eb393..dfeede8c 100644 --- a/packages/search-typesense/src/query-compiler.ts +++ b/packages/search-typesense/src/query-compiler.ts @@ -7,7 +7,7 @@ import { type Filter, type SearchField, type SearchQuery, - type SearchSchema, + type SearchType, type Sort, } from '@lde/search'; @@ -31,17 +31,17 @@ export interface CompileOptions { export function buildSearchParams( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, options: CompileOptions = {}, ): SearchParams { const folded = query.text !== undefined && query.text.length > 0 ? fold(query.text) : undefined; - const { names, weights } = queryFields(schema, query.locale); - const filterBy = compileFilterBy(query.where, schema); + const { names, weights } = queryFields(searchType, query.locale); + const filterBy = compileFilterBy(query.where, searchType); const sortBy = query.orderBy - .map((sort) => compileSort(sort, schema, query.locale)) + .map((sort) => compileSort(sort, searchType, query.locale)) .join(','); const params: SearchParams = { q: folded ?? 
'*', @@ -59,7 +59,7 @@ export function buildSearchParams( params.sort_by = sortBy; } if (query.facets.length > 0) { - params.facet_by = compileFacetBy(query.facets, schema); + params.facet_by = compileFacetBy(query.facets, searchType); if (options.maxFacetValues !== undefined) { params.max_facet_values = options.maxFacetValues; } @@ -76,11 +76,13 @@ export function buildSearchParams( */ function compileFacetBy( facets: readonly string[], - schema: SearchSchema, + searchType: SearchType, ): string { return facets .map((name) => { - const field = schema.fields.find((candidate) => candidate.name === name); + const field = searchType.fields.find( + (candidate) => candidate.name === name, + ); return field?.facetRanges !== undefined && field.facetRanges.length > 0 ? compileRangeFacet(field.name, field.facetRanges) : name; @@ -107,12 +109,12 @@ function compileRangeFacet( * still surface. */ function queryFields( - schema: SearchSchema, + searchType: SearchType, locale: string, ): { readonly names: string[]; readonly weights: number[] } { const names: string[] = []; const weights: number[] = []; - for (const field of searchableFields(schema)) { + for (const field of searchableFields(searchType)) { const search = physicalFields(field).search; const baseWeight = field.searchable.weight; if (field.kind === 'text' && field.localized === true) { @@ -136,19 +138,19 @@ function queryFields( /** AND-join the compiled `where` clauses; skips unknown fields and empty clauses. 
*/ function compileFilterBy( where: readonly Filter[], - schema: SearchSchema, + searchType: SearchType, ): string { return where - .map((filter) => compileFilter(filter, schema)) + .map((filter) => compileFilter(filter, searchType)) .filter((clause): clause is string => clause !== undefined) .join(' && '); } function compileFilter( filter: Filter, - schema: SearchSchema, + searchType: SearchType, ): string | undefined { - const field = schema.fields.find( + const field = searchType.fields.find( (candidate) => candidate.name === filter.field, ); if (field === undefined) { @@ -209,11 +211,15 @@ function compileRange( * text field sorts on its active-locale folded key; any other field (including a * deployment tie-break like `status_rank`) sorts on its own name. */ -function compileSort(sort: Sort, schema: SearchSchema, locale: string): string { +function compileSort( + sort: Sort, + searchType: SearchType, + locale: string, +): string { if (sort.field === 'relevance') { return `_text_match:${sort.direction}`; } - const field = schema.fields.find( + const field = searchType.fields.find( (candidate) => candidate.name === sort.field, ); if ( diff --git a/packages/search-typesense/src/search.ts b/packages/search-typesense/src/search.ts index 3e2a9959..fc52aca4 100644 --- a/packages/search-typesense/src/search.ts +++ b/packages/search-typesense/src/search.ts @@ -10,7 +10,7 @@ import { type SearchHit, type SearchQuery, type SearchResult, - type SearchSchema, + type SearchType, type SearchValue, } from '@lde/search'; import { buildSearchParams, escapeFilterValue } from './query-compiler.js'; @@ -90,9 +90,9 @@ export function createTypesenseSearchEngine( return { async search( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, ): Promise { - const params = buildSearchParams(query, schema, { + const params = buildSearchParams(query, searchType, { maxFacetValues: options.maxFacetValues, }); const response = (await client @@ -111,20 +111,20 @@ export 
function createTypesenseSearchEngine( options.labelsCollection, options.labelCacheTtlMs, ); - labels = selectLabels(allLabels, referenceIris(response, schema)); + labels = selectLabels(allLabels, referenceIris(response, searchType)); } else { try { labels = await fetchLabels( client, options.labelsCollection, - referenceIris(response, schema), + referenceIris(response, searchType), ); } catch (error) { options.onLabelError?.(error); } } } - return parseSearchResponse(response, schema, labels); + return parseSearchResponse(response, searchType, labels); }, }; } @@ -169,17 +169,17 @@ function selectLabels( /** Every distinct reference IRI whose label the result will actually use. */ function referenceIris( response: TypesenseSearchResponse, - schema: SearchSchema, + searchType: SearchType, ): string[] { const referenceFieldSet = new Set( - schema.fields + searchType.fields .filter((field) => field.kind === 'reference') .map((field) => field.name), ); // Hits only carry labels for OUTPUT reference fields: reconstructDocument skips // non-output fields, so resolving a non-output reference's hit labels (e.g. a // facet-only `class` with dozens of IRIs per hit) is pure waste. - const outputReferenceFields = outputFields(schema) + const outputReferenceFields = outputFields(searchType) .filter((field) => field.kind === 'reference') .map((field) => field.name); const iris = new Set(); @@ -288,17 +288,17 @@ export interface TypesenseSearchResponse { */ export function parseSearchResponse( response: TypesenseSearchResponse, - schema: SearchSchema, + searchType: SearchType, labels: ReadonlyMap, ): SearchResult { const hits: SearchHit[] = (response.hits ?? []).map((hit) => ({ id: String(hit.document.id), - document: reconstructDocument(hit.document, schema, labels), + document: reconstructDocument(hit.document, searchType, labels), })); // Reference facets are IRI-keyed; their buckets carry a resolved data label. 
// Plain facets (tokens, free strings) carry no label — the consumer owns display. const referenceFacets = new Set( - schema.fields + searchType.fields .filter((field) => field.kind === 'reference') .map((field) => field.name), ); @@ -307,7 +307,7 @@ export function parseSearchResponse( const labelled = referenceFacets.has(facet.field_name); // A range facet echoes the declared range key as the bucket value; look the // bin's half-open bounds back up by key so the bucket is self-describing. - const field = schema.fields.find( + const field = searchType.fields.find( (candidate) => candidate.name === facet.field_name, ); const rangesByKey = @@ -332,11 +332,11 @@ export function parseSearchResponse( /** Rebuild one logical document from a flat Typesense document. */ function reconstructDocument( flat: Record, - schema: SearchSchema, + searchType: SearchType, labels: ReadonlyMap, ): ResultDocument { const document: Record = {}; - for (const field of outputFields(schema)) { + for (const field of outputFields(searchType)) { if (field.kind === 'boolean') { // A boolean is always present; an absent value means false. 
document[field.name] = flat[field.name] === true; diff --git a/packages/search-typesense/test/collection-schema.test.ts b/packages/search-typesense/test/collection-schema.test.ts index 8d82507d..49711c1e 100644 --- a/packages/search-typesense/test/collection-schema.test.ts +++ b/packages/search-typesense/test/collection-schema.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import type { SearchSchema } from '@lde/search'; +import type { SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/generator-stability.test.ts b/packages/search-typesense/test/generator-stability.test.ts index bb7eca2a..9b93d134 100644 --- a/packages/search-typesense/test/generator-stability.test.ts +++ b/packages/search-typesense/test/generator-stability.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from 'vitest'; -import type { SearchSchema } from '@lde/search'; +import type { SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; /** @@ -9,7 +9,7 @@ import { buildCollectionSchema } from '../src/collection-schema.js'; * types, the physical fanout, stem/locale, optional/default-sorting-field) * surfaces as a snapshot diff before this library is published. 
*/ -const THING: SearchSchema = { +const THING: SearchType = { type: 'https://example.org/Thing', fields: [ { diff --git a/packages/search-typesense/test/parse-response.test.ts b/packages/search-typesense/test/parse-response.test.ts index 55a09bdd..b59b1a47 100644 --- a/packages/search-typesense/test/parse-response.test.ts +++ b/packages/search-typesense/test/parse-response.test.ts @@ -1,5 +1,5 @@ import { afterEach, describe, expect, it, vi } from 'vitest'; -import type { LocalizedValue, SearchQuery, SearchSchema } from '@lde/search'; +import type { LocalizedValue, SearchQuery, SearchType } from '@lde/search'; import type { Client } from 'typesense'; import { createTypesenseSearchEngine, @@ -7,7 +7,7 @@ import { parseSearchResponse, } from '../src/search.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { @@ -150,7 +150,7 @@ describe('parseSearchResponse', () => { }); describe('parseSearchResponse range facets', () => { - const rangeSchema: SearchSchema = { + const rangeSchema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/query-compiler.test.ts b/packages/search-typesense/test/query-compiler.test.ts index 6556e7b3..9a06d0f8 100644 --- a/packages/search-typesense/test/query-compiler.test.ts +++ b/packages/search-typesense/test/query-compiler.test.ts @@ -1,8 +1,8 @@ import { describe, expect, it } from 'vitest'; -import type { SearchQuery, SearchSchema } from '@lde/search'; +import type { SearchQuery, SearchType } from '@lde/search'; import { buildSearchParams } from '../src/query-compiler.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search-typesense/test/search-engine.test.ts b/packages/search-typesense/test/search-engine.test.ts index 8847d2d7..32f94a59 100644 --- a/packages/search-typesense/test/search-engine.test.ts +++ 
b/packages/search-typesense/test/search-engine.test.ts @@ -1,11 +1,11 @@ import { afterAll, beforeAll, describe, expect, it } from 'vitest'; import type { Client } from 'typesense'; -import type { SearchEngine, SearchQuery, SearchSchema } from '@lde/search'; +import type { SearchEngine, SearchQuery, SearchType } from '@lde/search'; import { buildCollectionSchema } from '../src/collection-schema.js'; import { createTypesenseSearchEngine } from '../src/search.js'; import { TypesenseContainer } from './typesense-container.js'; -const datasetSchema: SearchSchema = { +const datasetSchema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [ { diff --git a/packages/search/README.md b/packages/search/README.md index 476170d9..ca84cd21 100644 --- a/packages/search/README.md +++ b/packages/search/README.md @@ -8,8 +8,9 @@ defined here. The library never names your domain — the same core drives a It provides four things: -- **the unified field model** — `SearchField` / `SearchSchema`: one declaration - per field that drives all four consumers below, so they cannot drift; +- **the unified field model** — `SearchField` / `SearchType` / `SearchSchema`: + one declaration per field that drives all four consumers below, so they + cannot drift; - **the neutral query IR** — `SearchQuery` / `Filter` / `Sort` + filter semantics, the shared compiler target every API surface parses into; - **the engine port** — `SearchEngine` and the logical result types @@ -29,17 +30,36 @@ plus capability flags (`searchable` / `filterable` / `facetable` / `sortable` / `output`) describe projection, the engine collection schema, the query semantics, and the API output in a single place. 
+## Terminology + +The model has three levels, mirroring both SHACL (the source vocabulary) and +GraphQL (one of the surfaces): + +| Term | What it is | SHACL | GraphQL | +| -------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | ----------- | +| `SearchField` | One queryable field: a `kind`, the IR `path` it projects from, and the capability flags it opts into | property shape | field | +| `SearchType` | One root type’s complete declaration: its `type` IRI plus its fields and derivations | NodeShape | object type | +| `SearchSchema` | The whole search declaration: every `SearchType`, keyed by `type` IRI — build one with `searchSchema(...types)` | shapes graph | schema | + +`projectGraph` consumes a `SearchSchema` (it projects every type in one pass); +the engine port and the GraphQL surface operate on one `SearchType` at a time. + ## Field model The mapping is data, not code. Each field declares its `kind`, the IR `path` to read (omit it for a **derived** field, populated by a `derivation`), and the capabilities it opts into. The physical field names a declaration fans out to -(per-locale search/sort keys, the grouped-facet companion) come from +(per-locale search/sort keys) come from `physicalFields`, the single convention projection, the collection schema and the query compiler all share. 
```ts -import { projectGraph, irisOf, type SearchSchema } from '@lde/search'; +import { + projectGraph, + irisOf, + searchSchema, + type SearchType, +} from '@lde/search'; const DATASET = { type: 'http://www.w3.org/ns/dcat#Dataset', @@ -74,14 +94,14 @@ const DATASET = { document.classCount = irisOf(node, 'urn:dr:class').length; }, ], -} as const satisfies SearchSchema; +} as const satisfies SearchType; -for await (const document of projectGraph(quads, [DATASET])) { +for await (const document of projectGraph(quads, searchSchema(DATASET))) { // one flat search document per matching subject, streamed } ``` -Capturing the schema with `as const satisfies SearchSchema` keeps the field +Capturing the type with `as const satisfies SearchType` keeps the field literals, so the API surface can derive typed facet/output keys from it (see `@lde/search-api-graphql`). diff --git a/packages/search/package.json b/packages/search/package.json index e81f647f..6e7414c9 100644 --- a/packages/search/package.json +++ b/packages/search/package.json @@ -1,7 +1,7 @@ { "name": "@lde/search", "version": "0.1.2", - "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. Bakes in no engine, protocol, or domain.", + "description": "Engine- and domain-agnostic core for RDF-backed search: a unified declarative field model (SearchField/SearchType/SearchSchema), a neutral query IR, the SearchEngine port with logical result types, and a streaming CONSTRUCT-to-document projection. 
Bakes in no engine, protocol, or domain.", "repository": { "url": "git+https://github.com/ldelements/lde.git", "directory": "packages/search" diff --git a/packages/search/src/engine.ts b/packages/search/src/engine.ts index bcf61657..1a47bd6b 100644 --- a/packages/search/src/engine.ts +++ b/packages/search/src/engine.ts @@ -1,5 +1,5 @@ import type { SearchQuery } from './query.js'; -import type { SearchSchema } from './schema.js'; +import type { SearchType } from './schema.js'; /** * The engine port — the boundary a concrete engine adapter (e.g. @@ -20,7 +20,7 @@ export interface SearchEngine< > { search( query: SearchQuery, - schema: SearchSchema, + searchType: SearchType, ): Promise>; } @@ -43,38 +43,38 @@ export type FacetMap = Readonly< >; /** - * The facet-field-name union of a schema — the keys a {@link SearchResult}’s - * `facets` can hold. Requires the schema be captured as a literal type - * (`as const satisfies SearchSchema`), so the `facetable: true` flags survive as - * literals; a plain `: SearchSchema` annotation widens them and yields `never`. + * The facet-field-name union of a search type — the keys a {@link SearchResult}’s + * `facets` can hold. Requires the type be captured as a literal + * (`as const satisfies SearchType`), so the `facetable: true` flags survive as + * literals; a plain `: SearchType` annotation widens them and yields `never`. */ -export type FacetFieldsOf = Extract< - Schema['fields'][number], +export type FacetFieldsOf = Extract< + Type['fields'][number], { readonly facetable: true } >['name']; /** - * The output-field-name union of a schema — the keys a {@link ResultDocument} - * can hold. Like {@link FacetFieldsOf}, requires the schema captured as a literal - * (`as const satisfies SearchSchema`). + * The output-field-name union of a search type — the keys a {@link ResultDocument} + * can hold. Like {@link FacetFieldsOf}, requires the type captured as a literal + * (`as const satisfies SearchType`). 
*/ -export type OutputFieldsOf = Extract< - Schema['fields'][number], +export type OutputFieldsOf = Extract< + Type['fields'][number], { readonly output: true } >['name']; -/** A {@link SearchEngine} narrowed to one schema: facet keys and document keys - * fixed to that schema’s facetable / output field names. The schema must be - * captured as `as const satisfies SearchSchema`. */ -export type EngineFor = SearchEngine< - FacetFieldsOf, - OutputFieldsOf +/** A {@link SearchEngine} narrowed to one search type: facet keys and document + * keys fixed to that type’s facetable / output field names. The type must be + * captured as `as const satisfies SearchType`. */ +export type EngineFor = SearchEngine< + FacetFieldsOf, + OutputFieldsOf >; -/** A {@link SearchResult} narrowed to one schema (see {@link EngineFor}). */ -export type ResultFor = SearchResult< - FacetFieldsOf, - OutputFieldsOf +/** A {@link SearchResult} narrowed to one search type (see {@link EngineFor}). */ +export type ResultFor = SearchResult< + FacetFieldsOf, + OutputFieldsOf >; /** diff --git a/packages/search/src/index.ts b/packages/search/src/index.ts index 37bc4db3..5f86c025 100644 --- a/packages/search/src/index.ts +++ b/packages/search/src/index.ts @@ -1,13 +1,14 @@ // Projection: RDF CONSTRUCT quads → flat search documents, driven by the unified -// SearchField/SearchSchema model below (one declaration; the fanout names come +// SearchField/SearchType model below (one declaration; the fanout names come // from `physicalFields`). export { projectGraph, irisOf, literalsOf, firstLiteralOf } from './project.js'; export type { SearchDocument } from './project.js'; // Unified field model: one declaration drives projection, engine collection -// schema, query semantics and the GraphQL surface. Plus the schema selectors and +// schema, query semantics and the GraphQL surface. Plus the field selectors and // the physical field-name convention they all share. 
export { + searchSchema, physicalFields, searchableFields, facetableFields, @@ -18,6 +19,7 @@ export { export type { FieldKind, SearchField, + SearchType, SearchSchema, Derivation, PhysicalFields, diff --git a/packages/search/src/project.ts b/packages/search/src/project.ts index 71e2416e..5aede395 100644 --- a/packages/search/src/project.ts +++ b/packages/search/src/project.ts @@ -5,6 +5,7 @@ import { physicalFields, type SearchField, type SearchSchema, + type SearchType, } from './schema.js'; /** A flat search document. `id` is the engine document key. */ @@ -12,45 +13,44 @@ export type SearchDocument = { id: string } & Record; /** * Project one framed JSON-LD node into a flat search document: apply each field - * of the schema, then run the derivations (which may read fields the field specs + * of the type, then run the derivations (which may read fields the field specs * already set). The physical field names a field fans out to come from * {@link physicalFields}, the single source shared with the engine collection * schema and the query compiler. */ export function projectDocument( node: FramedNode, - schema: SearchSchema, + searchType: SearchType, ): SearchDocument { const id = node['@id']; if (typeof id !== 'string') { throw new Error( - `Cannot project a ${schema.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, + `Cannot project a ${searchType.type} node without an @id: every search document needs a stable key, and an empty one would collide with other keyless nodes.`, ); } const document: SearchDocument = { id }; - for (const field of schema.fields) { + for (const field of searchType.fields) { applyField(document, node, field); } - for (const derive of schema.derivations ?? []) { + for (const derive of searchType.derivations ?? 
[]) { derive(document, node); } return document; } /** - * Frame `quads` for every schema’s root type and project each node with its - * type’s schema — the multi-shape pipeline. Streams one document at a time so - * memory stays flat. The IR maps to a schema by type, so adding a shape is - * adding a `SearchSchema` (no engine change). + * Frame `quads` for every root type in the schema and project each node with its + * type’s declaration — the multi-shape pipeline. Streams one document at a time + * so memory stays flat. The IR maps to a declaration by type, so adding a shape + * is adding a `SearchType` to the schema (no engine change). */ export async function* projectGraph( quads: readonly Quad[], - schemas: readonly SearchSchema[], + schema: SearchSchema, ): AsyncIterable { - const byType = new Map(schemas.map((schema) => [schema.type, schema])); - for (const schema of byType.values()) { - for await (const node of frameByType(quads, schema.type)) { - yield projectDocument(node, schema); + for (const searchType of schema.values()) { + for await (const node of frameByType(quads, searchType.type)) { + yield projectDocument(node, searchType); } } } @@ -91,7 +91,7 @@ function applyField( isoToUnix(firstLiteralOf(node, path)), ); } - // `boolean` is not projected from a path in current schemas — booleans are + // `boolean` is not projected from a path in current search types — booleans are // derivation-populated (e.g. the compatibility vinkjes). } diff --git a/packages/search/src/schema.ts b/packages/search/src/schema.ts index 41ed5356..7a687925 100644 --- a/packages/search/src/schema.ts +++ b/packages/search/src/schema.ts @@ -110,12 +110,24 @@ export type Derivation = (document: SearchDocument, node: FramedNode) => void; * shapes (and derived fields), `derivations` are its `sh:rule`-shaped computed * fields. A generator emits one of these per NodeShape. 
*/ -export interface SearchSchema { +export interface SearchType { readonly type: string; readonly fields: readonly SearchField[]; readonly derivations?: readonly Derivation[]; } +/** + * The complete search declaration of a deployment: every root {@link SearchType}, + * keyed by its `type` IRI — the runtime form of a whole SHACL shapes graph. + * Build one with {@link searchSchema}. + */ +export type SearchSchema = ReadonlyMap; + +/** Build a {@link SearchSchema} from root-type declarations, keyed by `type`. */ +export function searchSchema(...types: readonly SearchType[]): SearchSchema { + return new Map(types.map((searchType) => [searchType.type, searchType])); +} + /** * The physical engine fields one {@link SearchField} fans out into, grouped by * the role each plays. The single source of truth for the naming convention, so @@ -142,11 +154,11 @@ export interface PhysicalFields { * `searchable` weight. */ export function searchableFields( - schema: SearchSchema, + searchType: SearchType, ): readonly (SearchField & { readonly searchable: { readonly weight: number }; })[] { - return schema.fields + return searchType.fields .filter( (field): field is SearchField & { searchable: { weight: number } } => field.searchable !== undefined, @@ -155,23 +167,27 @@ export function searchableFields( } /** Fields returned as facet buckets, in declaration order. */ -export function facetableFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.facetable === true); +export function facetableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.facetable === true); } /** Fields usable in `where`, in declaration order. 
*/ -export function filterableFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.filterable === true); +export function filterableFields( + searchType: SearchType, +): readonly SearchField[] { + return searchType.fields.filter((field) => field.filterable === true); } /** Fields publicly selectable in `orderBy`, in declaration order. */ -export function sortableFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.sortable === true); +export function sortableFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.sortable === true); } /** Fields that appear in the API output type, in declaration order. */ -export function outputFields(schema: SearchSchema): readonly SearchField[] { - return schema.fields.filter((field) => field.output === true); +export function outputFields(searchType: SearchType): readonly SearchField[] { + return searchType.fields.filter((field) => field.output === true); } /** Derive the physical engine field names a declaration produces. 
*/ diff --git a/packages/search/test/engine.test.ts b/packages/search/test/engine.test.ts index 54ad819d..14966451 100644 --- a/packages/search/test/engine.test.ts +++ b/packages/search/test/engine.test.ts @@ -1,9 +1,9 @@ import { describe, expect, it } from 'vitest'; import type { EngineFor, SearchEngine, SearchResult } from '../src/engine.js'; import type { SearchQuery } from '../src/query.js'; -import type { SearchSchema } from '../src/schema.js'; +import type { SearchType } from '../src/schema.js'; -const schema: SearchSchema = { +const schema: SearchType = { type: 'http://www.w3.org/ns/dcat#Dataset', fields: [{ name: 'title', kind: 'text', localized: true, locales: ['nl'] }], }; @@ -73,7 +73,7 @@ describe('typed facet and document keys', () => { { name: 'format', kind: 'keyword', array: true, facetable: true }, { name: 'status', kind: 'keyword', facetable: true }, ], - } as const satisfies SearchSchema; + } as const satisfies SearchType; // facets ⊂ { format, status }, document keys ⊂ { title }. These object // literals would not compile if the helpers widened to `string`/`never`. 
diff --git a/packages/search/test/project.test.ts b/packages/search/test/project.test.ts index 592caac6..2cd261f2 100644 --- a/packages/search/test/project.test.ts +++ b/packages/search/test/project.test.ts @@ -7,7 +7,12 @@ import { irisOf, type SearchDocument, } from '../src/project.js'; -import type { SearchField, SearchSchema, Derivation } from '../src/schema.js'; +import { + searchSchema, + type SearchField, + type SearchType, + type Derivation, +} from '../src/schema.js'; const DR = 'urn:dr:'; const IANA = 'https://www.iana.org/assignments/media-types/'; @@ -80,7 +85,7 @@ const derivations: Derivation[] = [ }, ]; -const schema: SearchSchema = { type: DATASET, fields, derivations }; +const schema: SearchType = { type: DATASET, fields, derivations }; describe('projectDocument', () => { it('projects every field kind and runs derivations', () => { @@ -338,7 +343,7 @@ describe('projectDocument', () => { }); describe('projectGraph', () => { - it('frames each schema’s type and projects matching nodes', async () => { + it('frames each root type in the schema and projects matching nodes', async () => { const quads = new Parser({ format: 'N-Triples' }).parse(` <${rdf.type.value}> <${DATASET}> . <${dcterms.title.value}> "Titel"@nl . 
@@ -349,9 +354,10 @@ describe('projectGraph', () => { `); const documents: SearchDocument[] = []; - for await (const document of projectGraph(quads, [ - { type: DATASET, fields }, - ])) { + for await (const document of projectGraph( + quads, + searchSchema({ type: DATASET, fields }), + )) { documents.push(document); } diff --git a/packages/search/test/schema.test.ts b/packages/search/test/schema.test.ts index 08ab0fd5..368821a6 100644 --- a/packages/search/test/schema.test.ts +++ b/packages/search/test/schema.test.ts @@ -7,12 +7,12 @@ import { searchableFields, sortableFields, type SearchField, - type SearchSchema, + type SearchType, } from '../src/schema.js'; const DATASET = 'http://www.w3.org/ns/dcat#Dataset'; -const schema: SearchSchema = { +const schema: SearchType = { type: DATASET, fields: [ { From 319c07233f2d272f2f656f0008f7482db955f269 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Thu, 2 Jul 2026 19:47:07 +0200 Subject: [PATCH 12/13] docs(readme): add search packages to the packages table and diagram - add the missing @lde/search-api-graphql row to the packages table - add the search, search-typesense, search-api-graphql and text-normalization dependency edges to the architecture diagram, which lacked the search family entirely --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 627c6bf2..dae6f504 100644 --- a/README.md +++ b/README.md @@ -143,6 +143,11 @@ await pipeline.run(); npm Project RDF into engine-agnostic search documents (framing + a declarative field spec) + + @lde/search-api-graphql + npm + Engine- and domain-agnostic GraphQL surface for search: builds an executable GraphQL schema from any SearchType at runtime + @lde/search-typesense npm @@ -229,6 +234,10 @@ graph TD subgraph Publication fastify-rdf docgen + search --> text-normalization + search-api-graphql --> search + search-typesense --> search + search-typesense --> text-normalization end subgraph Monitoring From 
fa4c5a0f3972c31204d5d8705902c6c6fc0e3c98 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Thu, 2 Jul 2026 19:53:05 +0200 Subject: [PATCH 13/13] test(search-typesense): update autoUpdate line-coverage threshold --- packages/search-typesense/vite.config.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/search-typesense/vite.config.ts b/packages/search-typesense/vite.config.ts index 9184cdbe..a6245e7b 100644 --- a/packages/search-typesense/vite.config.ts +++ b/packages/search-typesense/vite.config.ts @@ -17,7 +17,7 @@ export default mergeConfig( // exercised, which is why branch coverage is lower. thresholds: { functions: 97.14, - lines: 93.28, + lines: 93.31, branches: 83.75, statements: 93.37, },