From d93a66e8da66a49ceee875d6034c2ada235c0ad1 Mon Sep 17 00:00:00 2001 From: Dave Fowler Date: Sat, 30 May 2026 19:06:02 -0700 Subject: [PATCH 1/3] Add provider-normalized context views --- SPEC.md | 121 +++++++ proposals/agent-schema-views.md | 359 +++++++++++++++++++++ src/agents_schema/dbt.py | 2 + src/agents_schema/destinations.py | 22 ++ src/agents_schema/lookml.py | 2 + src/agents_schema/osi.py | 2 + src/agents_schema/root.py | 19 ++ src/agents_schema/views.py | 513 ++++++++++++++++++++++++++++++ tests/test_connector_root.py | 19 ++ tests/test_destinations.py | 7 +- tests/test_root.py | 21 +- tests/test_views.py | 103 ++++++ 12 files changed, 1187 insertions(+), 3 deletions(-) create mode 100644 proposals/agent-schema-views.md create mode 100644 src/agents_schema/views.py create mode 100644 tests/test_views.py diff --git a/SPEC.md b/SPEC.md index 7e5f62c..cd1fb2f 100644 --- a/SPEC.md +++ b/SPEC.md @@ -82,6 +82,116 @@ The current package delivers one table family per metadata source: Each ingestion replaces its own table family with `CREATE OR REPLACE TABLE` and then inserts the rows parsed from the source metadata. +Each ingestion also refreshes provider-normalized views and generic context views over whichever provider tables currently exist. These views are intended to be familiar drop-in starting points for agents that would otherwise reach for `INFORMATION_SCHEMA.TABLES` or `INFORMATION_SCHEMA.COLUMNS`, while preserving source-provider references for deeper inspection. + +The generic views are documented in `AGENTS.ROOT` under the `core` provider. + +| View | Purpose | +|---|---| +| `AGENTS.TABLES` | `INFORMATION_SCHEMA.TABLES` enriched with matching provider table context. | +| `AGENTS.COLUMNS` | Column/field-like analytical objects from provider tables. | +| `AGENTS.RELATIONSHIPS` | Lineage and semantic relationship edges from provider tables. | +| `AGENTS.METRICS` | Metric/measure-like semantic objects from provider tables. 
| +| `AGENTS.ENTITIES` | Reserved entity-oriented view; empty until a provider contributes entity metadata. | + +--- + +## Generic Context Views + +The generic views are compatibility-oriented: the leading columns mirror common information-schema concepts where possible, and later columns add provider-derived context that has cross-provider meaning. Provider-normalized views are staging inputs, not new sources of truth. + +Provider-owned normalized views feed the generic views: + +| Generic view | Provider-normalized inputs | +|---|---| +| `AGENTS.TABLES` | `INFORMATION_SCHEMA.TABLES` left joined to `AGENTS.DBT_TABLES`, `AGENTS.LOOKML_TABLES`, `AGENTS.OSI_TABLES` | +| `AGENTS.COLUMNS` | `AGENTS.DBT_COLUMNS`, `AGENTS.LOOKML_COLUMNS`, `AGENTS.OSI_COLUMNS` | +| `AGENTS.RELATIONSHIPS` | `AGENTS.DBT_RELATIONSHIPS`, `AGENTS.OSI_RELATIONSHIPS` | +| `AGENTS.METRICS` | `AGENTS.LOOKML_METRICS`, `AGENTS.OSI_METRICS` | + +A provider participates in `AGENTS.TABLES` by publishing a matching provider-normalized `*_TABLES` view. `AGENTS.TABLES` uses `INFORMATION_SCHEMA.TABLES` as its row spine and left joins every provider-normalized table view by `table_catalog`, `table_schema`, and `table_name`. Provider-specific detail stays in source tables such as `AGENTS.LOOKML_DIMENSION`; the merged view exposes provider-prefixed context columns for the common fields worth carrying forward. + +### `AGENTS.TABLES` + +```sql +CREATE OR REPLACE VIEW AGENTS.TABLES AS ... +``` + +| Column | Description | +|---|---| +| `table_catalog` through `is_hybrid` | Native columns from Snowflake `INFORMATION_SCHEMA.TABLES`. | +| `{provider}_display_name` | Provider label for the matched table, such as `dbt_display_name`. | +| `{provider}_description` | Provider description for the matched table. | +| `{provider}_ai_context` | Provider AI context for the matched table. | +| `{provider}_source_object_id` | Provider-specific object identifier. | +| `{provider}_source_path` | Source file path when available. 
| +| `{provider}_materialization` | Provider materialization when available. | +| `{provider}_tags` | Provider tags when available. | +| `memories_count` | Sum of provider memory counts; reserved for memory-provider integration. | +| `warnings_count` | Sum of provider warning counts; reserved for memory-provider integration. | + +### `AGENTS.COLUMNS` + +| Column | Description | +|---|---| +| `table_catalog` | Catalog/database name when known. | +| `table_schema` | Schema name when known. | +| `table_name` | Parent table-like object name. | +| `column_name` | Column/field-like object name. | +| `ordinal_position` | Ordinal position when known. | +| `data_type` | Provider data type when known. | +| `is_nullable` | Nullability when known. | +| `display_name` | Human-facing label when available. | +| `description` | Provider description. | +| `ai_context` | Provider AI context when available. | +| `semantic_type` | Provider semantic field kind when available. | +| `is_time_dimension` | Whether the field is marked as time-like. | +| `expression` | Provider expression or SQL when available. | +| `source_provider` | Provider that contributed the row. | +| `source_object_id` | Provider-specific object identifier. | +| `memories_count` | Reserved for memory-provider integration. | +| `warnings_count` | Reserved for memory-provider integration. | + +### `AGENTS.RELATIONSHIPS` + +| Column | Description | +|---|---| +| `relationship_name` | Relationship or lineage edge name. | +| `from_catalog` | Source catalog/database when known. | +| `from_schema` | Source schema when known. | +| `from_table` | Source table/object. | +| `from_column` | Source column(s) when known. | +| `to_catalog` | Destination catalog/database when known. | +| `to_schema` | Destination schema when known. | +| `to_table` | Destination table/object. | +| `to_column` | Destination column(s) when known. | +| `relationship_type` | Relationship type, such as `lineage` or `semantic_relationship`. 
| +| `multiplicity` | Multiplicity when known. | +| `source_provider` | Provider that contributed the row. | +| `source_object_id` | Provider-specific object identifier. | +| `memories_count` | Reserved for memory-provider integration. | +| `warnings_count` | Reserved for memory-provider integration. | + +### `AGENTS.METRICS` + +| Column | Description | +|---|---| +| `metric_name` | Metric or measure name. | +| `display_name` | Human-facing label when available. | +| `description` | Provider description. | +| `ai_context` | Provider AI context when available. | +| `expression` | Metric expression or SQL when available. | +| `source_provider` | Provider that contributed the row. | +| `source_object_id` | Provider-specific object identifier. | +| `dataset_name` | Parent dataset when available. | +| `view_name` | Parent LookML view when available. | +| `memories_count` | Reserved for memory-provider integration. | +| `warnings_count` | Reserved for memory-provider integration. | + +### `AGENTS.ENTITIES` + +`AGENTS.ENTITIES` is currently an empty typed view reserved for providers that contribute canonical entity metadata in a future release. 
+ --- ## Source: dbt @@ -446,6 +556,12 @@ The current source provider names are: | `lookml` | `AGENTS.LOOKML_*` | | `osi` | `AGENTS.OSI_*` | +The current core provider name is: + +| Provider | Objects | +|---|---| +| `core` | `AGENTS.ROOT`, `AGENTS.TABLES`, `AGENTS.COLUMNS`, `AGENTS.RELATIONSHIPS`, `AGENTS.METRICS`, `AGENTS.ENTITIES` | + --- ## Summary of Current Tables @@ -453,6 +569,11 @@ The current source provider names are: | Table | Source | Purpose | |---|---|---| | `AGENTS.ROOT` | core | Provider registry upserted by dbt, LookML, and OSI workflows | +| `AGENTS.TABLES` | core | Generic table/object context view | +| `AGENTS.COLUMNS` | core | Generic column/field context view | +| `AGENTS.RELATIONSHIPS` | core | Generic relationship and lineage context view | +| `AGENTS.METRICS` | core | Generic metric and measure context view | +| `AGENTS.ENTITIES` | core | Reserved generic entity context view | | `AGENTS.DBT_MODEL` | dbt | dbt models with schema, materialization, documentation, path, and tags | | `AGENTS.DBT_COLUMN` | dbt | Documented dbt model columns | | `AGENTS.DBT_DEPENDENCY` | dbt | Direct dbt dependency edges | diff --git a/proposals/agent-schema-views.md b/proposals/agent-schema-views.md new file mode 100644 index 0000000..751b679 --- /dev/null +++ b/proposals/agent-schema-views.md @@ -0,0 +1,359 @@ +# Agents Schema Context Views Proposal + +**Status:** Proposal +**Branch:** `agent_schema_views` + +## Summary + +Add information-schema-like views to Agents Schema so agents can query richer metadata through familiar object names: + +```sql +AGENTS.TABLES +AGENTS.COLUMNS +AGENTS.RELATIONSHIPS +AGENTS.METRICS +AGENTS.ENTITIES +``` + +The goal is to make Agents Schema instantly swappable for common `INFORMATION_SCHEMA` exploration patterns while adding richer context. 
Anywhere an agent would normally ask `INFORMATION_SCHEMA.TABLES` or `INFORMATION_SCHEMA.COLUMNS`, it should be able to ask `AGENTS.TABLES` or `AGENTS.COLUMNS` instead and get the familiar shape plus dbt descriptions, LookML/OSI semantic metadata, memory counts, warnings, source provider references, and eventually profiling or usage context. + +## Motivation + +Most SQL agents already know to inspect: + +```sql +INFORMATION_SCHEMA.TABLES +INFORMATION_SCHEMA.COLUMNS +``` + +But native information schema is too thin for analytic work. It can tell an agent that a column exists, but not: + +- which dbt model documented it +- whether it has LookML or OSI semantic context +- which metric uses it +- whether joining it causes fanout +- whether amounts need scaling +- whether a table is a semantic dataset, staging table, or source mirror + +Agents Schema already has source-specific tables. Context views would provide a generic layer over them. + +## Design Principles + +- **Views, not new source of truth.** Source provider tables remain canonical. +- **Information-schema swappable.** Preserve familiar view names and core columns so agents can reuse existing `INFORMATION_SCHEMA` habits with a richer source. +- **Provider-owned normalization.** Providers publish their own `AGENTS.{PROVIDER}_TABLES`, `AGENTS.{PROVIDER}_COLUMNS`, and related normalized views when they want to participate in the generic layer. +- **Provider-aware.** Preserve `source_provider` and `source_object_id` so agents can drill down. +- **Composable with memory.** If the memory provider exists, views can expose memory/warning counts and optional compact memory text. +- **Sparse first.** Start with dbt/LookML/OSI fields already available today; add warehouse-native metadata later. + +## Proposed Views + +The first columns in each view should intentionally resemble the equivalent `INFORMATION_SCHEMA` view where one exists. 
Agents and existing metadata snippets should be able to select familiar columns first, then opt into the extended columns. + +Core generic views should not directly know every source table. Instead, each provider maps its native metadata into provider-normalized views with the shared shape: + +```text +AGENTS.DBT_TABLES +AGENTS.DBT_COLUMNS +AGENTS.DBT_RELATIONSHIPS +AGENTS.LOOKML_TABLES +AGENTS.LOOKML_COLUMNS +AGENTS.LOOKML_METRICS +AGENTS.OSI_TABLES +AGENTS.OSI_COLUMNS +AGENTS.OSI_RELATIONSHIPS +AGENTS.OSI_METRICS +``` + +Then `AGENTS.TABLES` is a merged information-schema view over every provider `*_TABLES` view: + +```text +AGENTS.TABLES = + INFORMATION_SCHEMA.TABLES + LEFT JOIN provider *_TABLES views by table_catalog/table_schema/table_name +``` + +Provider-specific fields are appended with provider prefixes, such as `dbt_description`, `lookml_ai_context`, or `osi_source_object_id`. This keeps native columns like `table_name` unambiguous while letting providers enrich matching warehouse tables. + +Other generic views can start as unions over provider-normalized views until they get their own native information-schema spine. Provider-specific detail remains in the raw provider tables and is reachable through provider-prefixed source object columns. + +### `AGENTS.TABLES` + +One row per native warehouse table or view from `INFORMATION_SCHEMA.TABLES`, enriched by any provider-normalized `*_TABLES` view that matches the same catalog/schema/table identity. 
+ +Suggested columns: + +```text +table_catalog +table_schema +table_name +table_type +table_owner +is_transient +clustering_key +row_count +bytes +retention_time +created +last_altered +comment +dbt_description +dbt_source_object_id +dbt_source_path +dbt_materialization +dbt_tags +lookml_description +lookml_ai_context +lookml_source_object_id +osi_description +osi_ai_context +osi_source_object_id +memories_count +warnings_count +``` + +Provider mappings: + +| Source | Mapping | +|---|---| +| `DBT_MODEL` | `schema_name`, `name`, `description`, `materialization`, `file_path`, `tags` | +| `LOOKML_VIEW` | `sql_table_name` when parseable, `name`, `label`, `description`, `ai_context`, `file_path` | +| `OSI_DATASET` | `source_table`, `name`, `description`, `ai_context` | + +Memory contribution: + +- table-anchored memories increment `memories_count` +- warning-bearing table memories increment `warnings_count` +- an optional future `agent_memories` field can aggregate compact memory summaries + +### `AGENTS.COLUMNS` + +One row per field/column-like object. 
+ +Suggested columns: + +```text +table_catalog +table_schema +table_name +column_name +ordinal_position +data_type +is_nullable +display_name +description +ai_context +semantic_type +is_time_dimension +expression +source_provider +source_object_id +memories_count +warnings_count +``` + +Provider mappings: + +| Source | Mapping | +|---|---| +| `DBT_COLUMN` + `DBT_MODEL` | model schema/name, `column_name`, `data_type`, `description` | +| `LOOKML_DIMENSION` | `view_name`, `field_name`, `field_kind`, `type`, `sql`, `description`, `ai_context`, `primary_key` | +| `LOOKML_MEASURE` | `view_name`, `measure_name`, `type`, `sql`, `description`, `ai_context`, `filters` | +| `OSI_FIELD` + `OSI_DATASET` | dataset source table/name, `field_name`, `label`, `description`, `ai_context`, `is_time_dimension`, `expression` | + +Memory contribution: + +- column-anchored memories attach directly +- unit rules, enum meanings, timezone warnings, and null semantics can show up in memory counts + +### `AGENTS.RELATIONSHIPS` + +One row per relationship or dependency edge. + +Suggested columns: + +```text +relationship_name +from_catalog +from_schema +from_table +from_column +to_catalog +to_schema +to_table +to_column +relationship_type +multiplicity +source_provider +source_object_id +memories_count +warnings_count +``` + +Provider mappings: + +| Source | Mapping | +|---|---| +| `DBT_DEPENDENCY` | lineage edge from upstream node to downstream model | +| `OSI_RELATIONSHIP` | explicit semantic relationship with from/to datasets and columns | +| LookML explores | future: join graph from explore definitions once modeled in a table | + +Memory contribution: + +- relationship-anchored memories attach directly +- fanout warnings and safe-join rules surface during join planning + +### `AGENTS.METRICS` + +One row per metric or measure-like semantic object. 
+ +Suggested columns: + +```text +metric_name +display_name +description +ai_context +expression +source_provider +source_object_id +dataset_name +view_name +memories_count +warnings_count +``` + +Provider mappings: + +| Source | Mapping | +|---|---| +| `OSI_METRIC` | metric name, description, ai_context, expression | +| `LOOKML_MEASURE` | measure name, view name, type/sql/filter expression, description, ai_context | +| dbt semantic layer | future provider | + +Memory contribution: + +- metric-anchored memories attach directly +- calculation caveats, exclusions, date policies, and unit rules show up near metrics + +### `AGENTS.ENTITIES` + +One row per canonical business entity when a provider contributes entity metadata. + +Suggested columns: + +```text +entity_id +display_name +description +source_provider +source_object_id +primary_table_schema +primary_table_name +primary_key_columns +memories_count +warnings_count +``` + +Initial provider mappings may be sparse. OSI entity-like structures, dbt semantic models, or custom providers can populate this later. 
+ +Memory contribution: + +- entity-anchored memories define identity rules and cross-source mappings +- examples: account is canonical customer, email is not stable identity, subscription is billing relationship not customer + +## Example Queries + +Column lookup with richer context: + +```sql +SELECT + table_schema, + table_name, + column_name, + data_type, + description, + ai_context, + memories_count, + warnings_count +FROM AGENTS.COLUMNS +WHERE LOWER(column_name) LIKE '%amount%'; +``` + +Find semantic tables with warning-bearing memories: + +```sql +SELECT + table_schema, + table_name, + description, + source_provider, + warnings_count +FROM AGENTS.TABLES +WHERE warnings_count > 0; +``` + +Find metrics with context: + +```sql +SELECT + metric_name, + description, + ai_context, + expression, + source_provider +FROM AGENTS.METRICS +WHERE LOWER(metric_name) IN ('arr', 'mrr', 'revenue'); +``` + +## Memory Provider Interaction + +The views should not own memories. They should consume a memory provider if present. + +If `AGENTS.MEMORY` and `AGENTS.MEMORY_ANCHOR` exist: + +- `AGENTS.TABLES` can count table-anchored memories. +- `AGENTS.COLUMNS` can count column-anchored memories. +- `AGENTS.RELATIONSHIPS` can count relationship-anchored memories. +- `AGENTS.METRICS` can count metric-anchored memories. +- `AGENTS.ENTITIES` can count entity-anchored memories. + +This keeps memory normalized while making the generic views useful for agents that do not know how to join memory tables yet. + +## Should Views Be In Core? + +Yes eventually, but they can start as a proposal or optional package because they introduce cross-provider semantics. + +The current source tables are provider-owned and easy to reason about. 
Views add a second layer: + +```text +source provider tables -> provider-normalized views -> generic context views -> agent queries +``` + +That layer should have tests that pin: + +- row identity rules +- duplicate handling when multiple providers describe the same object +- how descriptions and `ai_context` are selected or combined +- behavior when the memory provider is absent + +## Duplicate And Merge Policy + +The hard part is not defining view columns; it is merging provider records. + +Recommended first version: + +- do not aggressively merge objects from different providers +- emit one row per provider object +- preserve `source_provider` and `source_object_id` +- let agents decide which source to trust when duplicates exist + +Later versions can add canonicalization if Agents Schema gains stable warehouse object identifiers. + +## Open Questions + +- Should views be materialized by the CLI or created as warehouse views? +- Should the first implementation include only dbt/OSI and leave LookML table parsing out? +- Should `TABLES`/`COLUMNS` include native warehouse objects, or only objects contributed by source providers? +- Should memory counts require the memory provider, or should views omit those columns until memory ships? +- Should `AGENTS.COLUMNS` include measures, or should measures live only in `AGENTS.METRICS`? 
diff --git a/src/agents_schema/dbt.py b/src/agents_schema/dbt.py index 5200a01..97103c2 100644 --- a/src/agents_schema/dbt.py +++ b/src/agents_schema/dbt.py @@ -6,6 +6,7 @@ from .destinations import Column, Destination, TableSchema, open_destination from .root import upsert_provider_root +from .views import create_context_views __all__ = ["run"] @@ -51,6 +52,7 @@ def run(cfg: dict) -> None: upsert_provider_root(dest, "dbt") _create_tables(dest) _ingest(dest, manifest) + create_context_views(dest) def _load_manifest(path: Path) -> dict: diff --git a/src/agents_schema/destinations.py b/src/agents_schema/destinations.py index 94493ba..2709173 100644 --- a/src/agents_schema/destinations.py +++ b/src/agents_schema/destinations.py @@ -37,6 +37,8 @@ def array_indexes(self) -> set[int]: class Destination(Protocol): def replace_table(self, table: TableSchema) -> None: ... + def replace_view(self, name: str, sql: str) -> None: ... + def existing_table_names(self) -> set[str]: ... def upsert_rows(self, table: TableSchema, rows: Iterable[tuple[Any, ...]]) -> None: ... def insert_rows(self, table: TableSchema, rows: Iterable[tuple[Any, ...]]) -> None: ... def close(self) -> None: ... 
@@ -65,6 +67,22 @@ def replace_table(self, table: TableSchema) -> None: cur.execute(f"CREATE SCHEMA IF NOT EXISTS {self._agents_schema}") cur.execute(_create_table_sql(table, self._agents_schema)) + def replace_view(self, name: str, sql: str) -> None: + with self._con.cursor() as cur: + cur.execute(f"CREATE SCHEMA IF NOT EXISTS {self._agents_schema}") + cur.execute(_create_view_sql(name, sql, self._agents_schema)) + + def existing_table_names(self) -> set[str]: + with self._con.cursor() as cur: + cur.execute( + "SELECT LOWER(table_name) " + "FROM information_schema.tables " + "WHERE table_schema = UPPER(%s) " + "AND table_type = 'BASE TABLE'", + (self._agents_schema,), + ) + return {str(row[0]) for row in cur.fetchall()} + def upsert_rows(self, table: TableSchema, rows: Iterable[tuple[Any, ...]]) -> None: bind_rows = _bind_rows(table, rows) if not bind_rows: @@ -250,6 +268,10 @@ def _create_table_if_not_exists_sql(table: TableSchema, schema: str) -> str: return _create_table_statement_sql("CREATE TABLE IF NOT EXISTS", table, schema) +def _create_view_sql(name: str, sql: str, schema: str) -> str: + return f"CREATE OR REPLACE VIEW {schema}.{_identifier(name)} AS\n{sql}" + + def _create_table_statement_sql(prefix: str, table: TableSchema, schema: str) -> str: definitions = [] for column in table.columns: diff --git a/src/agents_schema/lookml.py b/src/agents_schema/lookml.py index d130b11..36620aa 100644 --- a/src/agents_schema/lookml.py +++ b/src/agents_schema/lookml.py @@ -8,6 +8,7 @@ from .destinations import Column, Destination, TableSchema, open_destination from .root import upsert_provider_root +from .views import create_context_views __all__ = ["run"] @@ -86,6 +87,7 @@ def run(cfg: dict) -> None: upsert_provider_root(dest, "lookml") _create_tables(dest) _ingest(dest, files, lookml_dir) + create_context_views(dest) def _load_lookml_files(lookml_dir: Path) -> list[Path]: diff --git a/src/agents_schema/osi.py b/src/agents_schema/osi.py index 368b79b..217840e 100644 
--- a/src/agents_schema/osi.py +++ b/src/agents_schema/osi.py @@ -10,6 +10,7 @@ from .destinations import Column, Destination, TableSchema, open_destination from .root import upsert_provider_root +from .views import create_context_views __all__ = ["run"] @@ -67,6 +68,7 @@ def run(cfg: dict) -> None: upsert_provider_root(dest, "osi") _create_tables(dest) _ingest(dest, models) + create_context_views(dest) def _load_osi_files(osi_dir: Path) -> list[dict]: diff --git a/src/agents_schema/root.py b/src/agents_schema/root.py index c10b125..9fe2ab7 100644 --- a/src/agents_schema/root.py +++ b/src/agents_schema/root.py @@ -16,11 +16,23 @@ ) ROOT_ENTRIES = { + "core": ( + ("overview", "# Core\nShared Agents Schema registry and generic context views."), + ("root", "Provider registry. See AGENTS.ROOT."), + ("tables", "Information-schema-like table/object context view enriched from provider *_TABLES views."), + ("columns", "Information-schema-like column/field context view unioned from provider *_COLUMNS views."), + ("relationships", "Relationship and lineage context view unioned from provider *_RELATIONSHIPS views."), + ("metrics", "Metric and measure context view unioned from provider *_METRICS views."), + ("entities", "Entity context view. Reserved until a provider contributes entity metadata. See AGENTS.ENTITIES."), + ), "dbt": ( ("overview", "# dbt\nTransformation metadata from dbt manifest.json."), ("model", "One row per dbt model. See AGENTS.DBT_MODEL."), ("column", "One row per documented dbt model column. See AGENTS.DBT_COLUMN."), ("dependency", "Direct dbt DAG edges. See AGENTS.DBT_DEPENDENCY."), + ("tables", "Provider-normalized table context view. See AGENTS.DBT_TABLES."), + ("columns", "Provider-normalized column context view. See AGENTS.DBT_COLUMNS."), + ("relationships", "Provider-normalized relationship context view. 
See AGENTS.DBT_RELATIONSHIPS."), ), "lookml": ( ("overview", "# LookML\nSemantic metadata parsed from LookML files."), @@ -28,6 +40,9 @@ ("dimension", "One row per LookML dimension or dimension group. See AGENTS.LOOKML_DIMENSION."), ("measure", "One row per LookML measure. See AGENTS.LOOKML_MEASURE."), ("explore", "One row per LookML explore. See AGENTS.LOOKML_EXPLORE."), + ("tables", "Provider-normalized table context view. See AGENTS.LOOKML_TABLES."), + ("columns", "Provider-normalized column context view. See AGENTS.LOOKML_COLUMNS."), + ("metrics", "Provider-normalized metric context view. See AGENTS.LOOKML_METRICS."), ), "osi": ( ("overview", "# OSI\nOpen Semantic Interchange metadata parsed from *.osi.yaml files."), @@ -35,6 +50,10 @@ ("field", "One row per OSI dataset field. See AGENTS.OSI_FIELD."), ("metric", "One row per OSI metric. See AGENTS.OSI_METRIC."), ("relationship", "One row per OSI relationship. See AGENTS.OSI_RELATIONSHIP."), + ("tables", "Provider-normalized table context view. See AGENTS.OSI_TABLES."), + ("columns", "Provider-normalized column context view. See AGENTS.OSI_COLUMNS."), + ("relationships", "Provider-normalized relationship context view. See AGENTS.OSI_RELATIONSHIPS."), + ("metrics", "Provider-normalized metric context view. 
See AGENTS.OSI_METRICS."), ), } diff --git a/src/agents_schema/views.py b/src/agents_schema/views.py new file mode 100644 index 0000000..918161c --- /dev/null +++ b/src/agents_schema/views.py @@ -0,0 +1,513 @@ +"""Information-schema-like context views over provider-normalized views.""" +from __future__ import annotations + +from .destinations import Destination +from .root import upsert_provider_root + +__all__ = [ + "CORE_VIEW_NAMES", + "PROVIDER_VIEW_NAMES", + "create_context_views", + "build_context_view_sql", +] + +CORE_VIEW_NAMES = frozenset({"tables", "columns", "relationships", "metrics", "entities"}) +PROVIDER_VIEW_NAMES = frozenset( + { + "dbt_tables", + "dbt_columns", + "dbt_relationships", + "lookml_tables", + "lookml_columns", + "lookml_metrics", + "osi_tables", + "osi_columns", + "osi_relationships", + "osi_metrics", + } +) +_LOOKML_RELATION_RE = r"^[A-Za-z_][A-Za-z0-9_$]*([.][A-Za-z_][A-Za-z0-9_$]*){0,2}$" + + +def create_context_views(dest: Destination) -> None: + """Create provider-normalized views and generic context views.""" + upsert_provider_root(dest, "core") + for name, sql in build_context_view_sql(dest.existing_table_names()).items(): + dest.replace_view(name, sql) + + +def build_context_view_sql(existing_tables: set[str]) -> dict[str, str]: + existing = {name.lower() for name in existing_tables} + provider_views = _provider_view_sql(existing) + return provider_views | { + "tables": _merge_table_views(provider_views), + "columns": _union_provider_views(provider_views, "columns", _COLUMN_COLUMNS), + "relationships": _union_provider_views(provider_views, "relationships", _RELATIONSHIP_COLUMNS), + "metrics": _union_provider_views(provider_views, "metrics", _METRIC_COLUMNS), + "entities": _union_provider_views(provider_views, "entities", _ENTITY_COLUMNS), + } + + +def _provider_view_sql(existing: set[str]) -> dict[str, str]: + return { + "dbt_tables": _dbt_tables_sql(existing), + "dbt_columns": _dbt_columns_sql(existing), + "dbt_relationships": 
_dbt_relationships_sql(existing), + "lookml_tables": _lookml_tables_sql(existing), + "lookml_columns": _lookml_columns_sql(existing), + "lookml_metrics": _lookml_metrics_sql(existing), + "osi_tables": _osi_tables_sql(existing), + "osi_columns": _osi_columns_sql(existing), + "osi_relationships": _osi_relationships_sql(existing), + "osi_metrics": _osi_metrics_sql(existing), + } + + +def _union_provider_views(provider_views: dict[str, str], suffix: str, columns: list[tuple[str, str]]) -> str: + selects = [ + f"SELECT {', '.join(name for name, _ in columns)}\nFROM agents.{view_name}" + for view_name in provider_views + if view_name.endswith(f"_{suffix}") + ] + return _union_or_empty(selects, columns) + + +def _merge_table_views(provider_views: dict[str, str]) -> str: + table_views = [view_name for view_name in provider_views if view_name.endswith("_tables")] + provider_selects = [ + _provider_table_select(alias) + for alias in (_provider_alias(view_name) for view_name in table_views) + ] + count_selects = [ + _provider_table_count_select(alias, "memories_count") + for alias in (_provider_alias(view_name) for view_name in table_views) + ] + warning_selects = [ + _provider_table_count_select(alias, "warnings_count") + for alias in (_provider_alias(view_name) for view_name in table_views) + ] + joins = "\n".join( + _provider_table_join(view_name, _provider_alias(view_name)) + for view_name in table_views + ) + return f"""SELECT + t.table_catalog, + t.table_schema, + t.table_name, + t.table_owner, + t.table_type, + t.is_transient, + t.clustering_key, + t.row_count, + t.bytes, + t.retention_time, + t.self_referencing_column_name, + t.reference_generation, + t.user_defined_type_catalog, + t.user_defined_type_schema, + t.user_defined_type_name, + t.is_insertable_into, + t.is_typed, + t.commit_action, + t.created, + t.last_altered, + t.last_ddl, + t.last_ddl_by, + t.auto_clustering_on, + t.comment, + t.is_temporary, + t.is_iceberg, + t.is_dynamic, + t.is_immutable, + 
t.is_hybrid, + {",\n ".join(provider_selects)}, + {" + ".join(count_selects)} AS memories_count, + {" + ".join(warning_selects)} AS warnings_count +FROM information_schema.tables t +{joins}""" + + +def _provider_alias(view_name: str) -> str: + return view_name.removesuffix("_tables") + + +def _provider_table_select(alias: str) -> str: + return f"""{alias}.display_name AS {alias}_display_name, + {alias}.description AS {alias}_description, + {alias}.ai_context AS {alias}_ai_context, + {alias}.source_object_id AS {alias}_source_object_id, + {alias}.source_path AS {alias}_source_path, + {alias}.materialization AS {alias}_materialization, + {alias}.tags AS {alias}_tags""" + + +def _provider_table_count_select(alias: str, column: str) -> str: + return f"COALESCE({alias}.{column}, 0)" + + +def _provider_table_join(view_name: str, alias: str) -> str: + return f"""LEFT JOIN ( + SELECT + table_catalog, + table_schema, + table_name, + MIN(display_name) AS display_name, + MIN(description) AS description, + MIN(ai_context) AS ai_context, + LISTAGG(source_object_id, ', ') WITHIN GROUP (ORDER BY source_object_id) AS source_object_id, + LISTAGG(source_path, ', ') WITHIN GROUP (ORDER BY source_path) AS source_path, + MIN(materialization) AS materialization, + ANY_VALUE(tags) AS tags, + SUM(memories_count) AS memories_count, + SUM(warnings_count) AS warnings_count + FROM agents.{view_name} + WHERE table_schema IS NOT NULL + AND table_name IS NOT NULL + GROUP BY table_catalog, table_schema, table_name +) {alias} + ON ({alias}.table_catalog IS NULL OR LOWER(t.table_catalog) = LOWER({alias}.table_catalog)) + AND LOWER(t.table_schema) = LOWER({alias}.table_schema) + AND LOWER(t.table_name) = LOWER({alias}.table_name)""" + + +def _union_or_empty(selects: list[str], columns: list[tuple[str, str]]) -> str: + if selects: + return "\nUNION ALL\n".join(selects) + projection = ",\n ".join(f"CAST(NULL AS {kind}) AS {name}" for name, kind in columns) + return f"SELECT\n {projection}\nWHERE 1 = 
0" + + +def _lookml_relation_identity_sql(sql_table_name: str, fallback_name: str) -> tuple[str, str, str]: + relation_is_simple = f"REGEXP_LIKE({sql_table_name}, '{_LOOKML_RELATION_RE}')" + part_count = f"REGEXP_COUNT({sql_table_name}, '[.]')" + return ( + f"""CASE + WHEN {relation_is_simple} AND {part_count} = 2 + THEN SPLIT_PART({sql_table_name}, '.', 1) + ELSE CAST(NULL AS VARCHAR) + END AS table_catalog""", + f"""CASE + WHEN {relation_is_simple} AND {part_count} = 2 + THEN SPLIT_PART({sql_table_name}, '.', 2) + WHEN {relation_is_simple} AND {part_count} = 1 + THEN SPLIT_PART({sql_table_name}, '.', 1) + ELSE CAST(NULL AS VARCHAR) + END AS table_schema""", + f"""CASE + WHEN {relation_is_simple} AND {part_count} = 2 + THEN SPLIT_PART({sql_table_name}, '.', 3) + WHEN {relation_is_simple} AND {part_count} = 1 + THEN SPLIT_PART({sql_table_name}, '.', 2) + WHEN {relation_is_simple} AND {part_count} = 0 + THEN {sql_table_name} + ELSE {fallback_name} + END AS table_name""", + ) + + +_TABLE_COLUMNS = [ + ("table_catalog", "VARCHAR"), + ("table_schema", "VARCHAR"), + ("table_name", "VARCHAR"), + ("table_type", "VARCHAR"), + ("display_name", "VARCHAR"), + ("description", "TEXT"), + ("ai_context", "TEXT"), + ("source_provider", "VARCHAR"), + ("source_object_id", "VARCHAR"), + ("source_path", "VARCHAR"), + ("materialization", "VARCHAR"), + ("tags", "VARIANT"), + ("memories_count", "NUMBER"), + ("warnings_count", "NUMBER"), +] + +_COLUMN_COLUMNS = [ + ("table_catalog", "VARCHAR"), + ("table_schema", "VARCHAR"), + ("table_name", "VARCHAR"), + ("column_name", "VARCHAR"), + ("ordinal_position", "NUMBER"), + ("data_type", "VARCHAR"), + ("is_nullable", "BOOLEAN"), + ("display_name", "VARCHAR"), + ("description", "TEXT"), + ("ai_context", "TEXT"), + ("semantic_type", "VARCHAR"), + ("is_time_dimension", "BOOLEAN"), + ("expression", "TEXT"), + ("source_provider", "VARCHAR"), + ("source_object_id", "VARCHAR"), + ("memories_count", "NUMBER"), + ("warnings_count", "NUMBER"), +] + 
+_RELATIONSHIP_COLUMNS = [ + ("relationship_name", "VARCHAR"), + ("from_catalog", "VARCHAR"), + ("from_schema", "VARCHAR"), + ("from_table", "VARCHAR"), + ("from_column", "VARCHAR"), + ("to_catalog", "VARCHAR"), + ("to_schema", "VARCHAR"), + ("to_table", "VARCHAR"), + ("to_column", "VARCHAR"), + ("relationship_type", "VARCHAR"), + ("multiplicity", "VARCHAR"), + ("source_provider", "VARCHAR"), + ("source_object_id", "VARCHAR"), + ("memories_count", "NUMBER"), + ("warnings_count", "NUMBER"), +] + +_METRIC_COLUMNS = [ + ("metric_name", "VARCHAR"), + ("display_name", "VARCHAR"), + ("description", "TEXT"), + ("ai_context", "TEXT"), + ("expression", "TEXT"), + ("source_provider", "VARCHAR"), + ("source_object_id", "VARCHAR"), + ("dataset_name", "VARCHAR"), + ("view_name", "VARCHAR"), + ("memories_count", "NUMBER"), + ("warnings_count", "NUMBER"), +] + +_ENTITY_COLUMNS = [ + ("entity_id", "VARCHAR"), + ("display_name", "VARCHAR"), + ("description", "TEXT"), + ("source_provider", "VARCHAR"), + ("source_object_id", "VARCHAR"), + ("primary_table_schema", "VARCHAR"), + ("primary_table_name", "VARCHAR"), + ("primary_key_columns", "VARIANT"), + ("memories_count", "NUMBER"), + ("warnings_count", "NUMBER"), +] + + +def _dbt_tables_sql(existing: set[str]) -> str: + if "dbt_model" not in existing: + return _union_or_empty([], _TABLE_COLUMNS) + return """SELECT + CAST(NULL AS VARCHAR) AS table_catalog, + schema_name AS table_schema, + name AS table_name, + 'DBT_MODEL' AS table_type, + name AS display_name, + description, + CAST(NULL AS TEXT) AS ai_context, + 'dbt' AS source_provider, + unique_id AS source_object_id, + file_path AS source_path, + materialization, + tags, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.dbt_model""" + + +def _dbt_columns_sql(existing: set[str]) -> str: + if not {"dbt_model", "dbt_column"}.issubset(existing): + return _union_or_empty([], _COLUMN_COLUMNS) + return """SELECT + CAST(NULL AS VARCHAR) AS table_catalog, + m.schema_name AS 
table_schema, + m.name AS table_name, + c.column_name, + CAST(NULL AS NUMBER) AS ordinal_position, + c.data_type, + CAST(NULL AS BOOLEAN) AS is_nullable, + c.column_name AS display_name, + c.description, + CAST(NULL AS TEXT) AS ai_context, + CAST(NULL AS VARCHAR) AS semantic_type, + CAST(NULL AS BOOLEAN) AS is_time_dimension, + CAST(NULL AS TEXT) AS expression, + 'dbt' AS source_provider, + c.model_id || '.' || c.column_name AS source_object_id, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.dbt_column c +JOIN agents.dbt_model m ON m.unique_id = c.model_id""" + + +def _dbt_relationships_sql(existing: set[str]) -> str: + if not {"dbt_dependency", "dbt_model"}.issubset(existing): + return _union_or_empty([], _RELATIONSHIP_COLUMNS) + return """SELECT + d.upstream_id || ' -> ' || d.downstream_id AS relationship_name, + CAST(NULL AS VARCHAR) AS from_catalog, + upstream.schema_name AS from_schema, + upstream.name AS from_table, + CAST(NULL AS VARCHAR) AS from_column, + CAST(NULL AS VARCHAR) AS to_catalog, + downstream.schema_name AS to_schema, + downstream.name AS to_table, + CAST(NULL AS VARCHAR) AS to_column, + 'lineage' AS relationship_type, + CAST(NULL AS VARCHAR) AS multiplicity, + 'dbt' AS source_provider, + d.upstream_id || ' -> ' || d.downstream_id AS source_object_id, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.dbt_dependency d +JOIN agents.dbt_model upstream ON upstream.unique_id = d.upstream_id +JOIN agents.dbt_model downstream ON downstream.unique_id = d.downstream_id""" + + +def _lookml_tables_sql(existing: set[str]) -> str: + if "lookml_view" not in existing: + return _union_or_empty([], _TABLE_COLUMNS) + catalog_sql, schema_sql, table_sql = _lookml_relation_identity_sql("sql_table_name", "name") + return f"""SELECT + {catalog_sql}, + {schema_sql}, + {table_sql}, + 'LOOKML_VIEW' AS table_type, + COALESCE(label, name) AS display_name, + description, + ai_context, + 'lookml' AS source_provider, + name AS source_object_id, + file_path 
AS source_path, + CAST(NULL AS VARCHAR) AS materialization, + PARSE_JSON('[]') AS tags, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.lookml_view""" + + +def _lookml_columns_sql(existing: set[str]) -> str: + if not {"lookml_dimension", "lookml_view"}.issubset(existing): + return _union_or_empty([], _COLUMN_COLUMNS) + catalog_sql, schema_sql, table_sql = _lookml_relation_identity_sql("v.sql_table_name", "v.name") + return f"""SELECT + {catalog_sql}, + {schema_sql}, + {table_sql}, + d.field_name AS column_name, + CAST(NULL AS NUMBER) AS ordinal_position, + d.type AS data_type, + CAST(NULL AS BOOLEAN) AS is_nullable, + d.field_name AS display_name, + d.description, + d.ai_context, + d.field_kind AS semantic_type, + d.field_kind = 'dimension_group' AS is_time_dimension, + d.sql AS expression, + 'lookml' AS source_provider, + d.view_name || '.' || d.field_name AS source_object_id, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.lookml_dimension d +JOIN agents.lookml_view v ON v.name = d.view_name""" + + +def _lookml_metrics_sql(existing: set[str]) -> str: + if "lookml_measure" not in existing: + return _union_or_empty([], _METRIC_COLUMNS) + return """SELECT + measure_name AS metric_name, + measure_name AS display_name, + description, + ai_context, + COALESCE(sql, filters) AS expression, + 'lookml' AS source_provider, + view_name || '.' 
|| measure_name AS source_object_id, + CAST(NULL AS VARCHAR) AS dataset_name, + view_name, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.lookml_measure""" + + +def _osi_tables_sql(existing: set[str]) -> str: + if "osi_dataset" not in existing: + return _union_or_empty([], _TABLE_COLUMNS) + catalog_sql, schema_sql, table_sql = _lookml_relation_identity_sql("source_table", "name") + return f"""SELECT + {catalog_sql}, + {schema_sql}, + {table_sql}, + 'OSI_DATASET' AS table_type, + name AS display_name, + description, + ai_context, + 'osi' AS source_provider, + name AS source_object_id, + CAST(NULL AS VARCHAR) AS source_path, + CAST(NULL AS VARCHAR) AS materialization, + PARSE_JSON('[]') AS tags, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.osi_dataset""" + + +def _osi_columns_sql(existing: set[str]) -> str: + if not {"osi_dataset", "osi_field"}.issubset(existing): + return _union_or_empty([], _COLUMN_COLUMNS) + return """SELECT + CAST(NULL AS VARCHAR) AS table_catalog, + CAST(NULL AS VARCHAR) AS table_schema, + d.source_table AS table_name, + f.field_name AS column_name, + CAST(NULL AS NUMBER) AS ordinal_position, + CAST(NULL AS VARCHAR) AS data_type, + CAST(NULL AS BOOLEAN) AS is_nullable, + COALESCE(f.label, f.field_name) AS display_name, + f.description, + f.ai_context, + CAST(NULL AS VARCHAR) AS semantic_type, + f.is_time_dimension, + f.expression, + 'osi' AS source_provider, + f.dataset_name || '.' 
|| f.field_name AS source_object_id, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.osi_field f +JOIN agents.osi_dataset d ON d.name = f.dataset_name""" + + +def _osi_relationships_sql(existing: set[str]) -> str: + if not {"osi_dataset", "osi_relationship"}.issubset(existing): + return _union_or_empty([], _RELATIONSHIP_COLUMNS) + return """SELECT + r.name AS relationship_name, + CAST(NULL AS VARCHAR) AS from_catalog, + CAST(NULL AS VARCHAR) AS from_schema, + from_dataset.source_table AS from_table, + r.from_columns::TEXT AS from_column, + CAST(NULL AS VARCHAR) AS to_catalog, + CAST(NULL AS VARCHAR) AS to_schema, + to_dataset.source_table AS to_table, + r.to_columns::TEXT AS to_column, + 'semantic_relationship' AS relationship_type, + CAST(NULL AS VARCHAR) AS multiplicity, + 'osi' AS source_provider, + r.name AS source_object_id, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.osi_relationship r +JOIN agents.osi_dataset from_dataset ON from_dataset.name = r.from_dataset +JOIN agents.osi_dataset to_dataset ON to_dataset.name = r.to_dataset""" + + +def _osi_metrics_sql(existing: set[str]) -> str: + if "osi_metric" not in existing: + return _union_or_empty([], _METRIC_COLUMNS) + return """SELECT + name AS metric_name, + name AS display_name, + description, + ai_context, + expression, + 'osi' AS source_provider, + name AS source_object_id, + CAST(NULL AS VARCHAR) AS dataset_name, + CAST(NULL AS VARCHAR) AS view_name, + 0 AS memories_count, + 0 AS warnings_count +FROM agents.osi_metric""" diff --git a/tests/test_connector_root.py b/tests/test_connector_root.py index 043917c..0452e33 100644 --- a/tests/test_connector_root.py +++ b/tests/test_connector_root.py @@ -8,6 +8,16 @@ class FakeDestination: def __init__(self): self.calls = [] + def existing_table_names(self): + return { + call[1].removeprefix("agents.") + for call in self.calls + if call[0] == "replace" + } + + def replace_view(self, name, sql): + self.calls.append(("view", name, sql)) + def 
upsert_rows(self, table, rows): self.calls.append(("upsert", table.name, list(rows))) @@ -44,6 +54,9 @@ def test_dbt_run_upserts_root_before_source_tables(self): self.assertEqual(dest.calls[0][0], "upsert") self.assertEqual({row[0] for row in dest.calls[0][2]}, {"dbt"}) self.assertEqual([call[0] for call in dest.calls[1:4]], ["replace", "replace", "replace"]) + self.assertEqual(dest.calls[4][0], "upsert") + self.assertEqual({row[0] for row in dest.calls[4][2]}, {"core"}) + self.assertEqual([call[0] for call in dest.calls[5:10]], ["view", "view", "view", "view", "view"]) def test_lookml_run_upserts_root_before_source_tables(self): dest = FakeDestination() @@ -59,6 +72,9 @@ def test_lookml_run_upserts_root_before_source_tables(self): self.assertEqual(dest.calls[0][0], "upsert") self.assertEqual({row[0] for row in dest.calls[0][2]}, {"lookml"}) self.assertEqual([call[0] for call in dest.calls[1:5]], ["replace", "replace", "replace", "replace"]) + self.assertEqual(dest.calls[5][0], "upsert") + self.assertEqual({row[0] for row in dest.calls[5][2]}, {"core"}) + self.assertEqual([call[0] for call in dest.calls[6:11]], ["view", "view", "view", "view", "view"]) def test_osi_run_upserts_root_before_source_tables(self): dest = FakeDestination() @@ -74,6 +90,9 @@ def test_osi_run_upserts_root_before_source_tables(self): self.assertEqual(dest.calls[0][0], "upsert") self.assertEqual({row[0] for row in dest.calls[0][2]}, {"osi"}) self.assertEqual([call[0] for call in dest.calls[1:5]], ["replace", "replace", "replace", "replace"]) + self.assertEqual(dest.calls[5][0], "upsert") + self.assertEqual({row[0] for row in dest.calls[5][2]}, {"core"}) + self.assertEqual([call[0] for call in dest.calls[6:11]], ["view", "view", "view", "view", "view"]) if __name__ == "__main__": diff --git a/tests/test_destinations.py b/tests/test_destinations.py index 5aa0a72..963caa5 100644 --- a/tests/test_destinations.py +++ b/tests/test_destinations.py @@ -1,6 +1,6 @@ import unittest -from 
agents_schema.destinations import _create_table_if_not_exists_sql, _merge_sql +from agents_schema.destinations import _create_table_if_not_exists_sql, _create_view_sql, _merge_sql from agents_schema.root import ROOT @@ -35,6 +35,11 @@ def test_root_merge_upserts_on_provider_and_key(self): sql, ) + def test_create_view_sql_validates_view_name_and_wraps_query(self): + sql = _create_view_sql("tables", "SELECT 1 AS value", "agents") + + self.assertEqual(sql, "CREATE OR REPLACE VIEW agents.tables AS\nSELECT 1 AS value") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_root.py b/tests/test_root.py index b5078e1..8c4ed4b 100644 --- a/tests/test_root.py +++ b/tests/test_root.py @@ -22,7 +22,10 @@ def test_upsert_provider_root_writes_only_requested_provider(self): self.assertIs(table, ROOT) self.assertTrue(rows) self.assertEqual({row[0] for row in rows}, {"dbt"}) - self.assertEqual({row[1] for row in rows}, {"overview", "model", "column", "dependency"}) + self.assertEqual( + {row[1] for row in rows}, + {"overview", "model", "column", "dependency", "tables", "columns", "relationships"}, + ) def test_upsert_provider_root_has_osi_entries(self): dest = FakeDestination() @@ -30,7 +33,21 @@ def test_upsert_provider_root_has_osi_entries(self): upsert_provider_root(dest, "osi") _, rows = dest.upserts[0] - self.assertEqual({row[1] for row in rows}, {"overview", "dataset", "field", "metric", "relationship"}) + self.assertEqual( + {row[1] for row in rows}, + {"overview", "dataset", "field", "metric", "relationship", "tables", "columns", "relationships", "metrics"}, + ) + + def test_upsert_provider_root_has_core_view_entries(self): + dest = FakeDestination() + + upsert_provider_root(dest, "core") + + _, rows = dest.upserts[0] + self.assertEqual( + {row[1] for row in rows}, + {"overview", "root", "tables", "columns", "relationships", "metrics", "entities"}, + ) if __name__ == "__main__": diff --git a/tests/test_views.py b/tests/test_views.py new file mode 100644 
index 0000000..47b9e54 --- /dev/null +++ b/tests/test_views.py @@ -0,0 +1,103 @@ +import unittest + +from agents_schema.views import CORE_VIEW_NAMES, PROVIDER_VIEW_NAMES, build_context_view_sql + + +class ContextViewSqlTests(unittest.TestCase): + def test_builds_provider_views_from_raw_provider_tables(self): + views = build_context_view_sql({"dbt_model", "dbt_column"}) + + self.assertEqual(PROVIDER_VIEW_NAMES | CORE_VIEW_NAMES, set(views)) + self.assertIn("FROM agents.dbt_model", views["dbt_tables"]) + self.assertIn("FROM agents.dbt_column c", views["dbt_columns"]) + + def test_core_tables_enriches_information_schema_with_provider_tables(self): + views = build_context_view_sql({"dbt_model", "dbt_column", "lookml_view"}) + + self.assertIn("FROM information_schema.tables t", views["tables"]) + self.assertIn("FROM agents.dbt_tables", views["tables"]) + self.assertIn(") dbt", views["tables"]) + self.assertIn("FROM agents.lookml_tables", views["tables"]) + self.assertIn(") lookml", views["tables"]) + self.assertIn("dbt.description AS dbt_description", views["tables"]) + self.assertIn("lookml.ai_context AS lookml_ai_context", views["tables"]) + self.assertIn("LOWER(t.table_name) = LOWER(dbt.table_name)", views["tables"]) + self.assertIn("GROUP BY table_catalog, table_schema, table_name", views["tables"]) + self.assertIn("SUM(memories_count) AS memories_count", views["tables"]) + self.assertIn("t.last_ddl", views["tables"]) + self.assertIn("t.last_ddl_by", views["tables"]) + self.assertIn("t.auto_clustering_on", views["tables"]) + self.assertIn("t.is_hybrid", views["tables"]) + self.assertNotIn("FROM agents.dbt_model", views["tables"]) + + def test_core_columns_still_union_provider_normalized_columns(self): + views = build_context_view_sql({"dbt_model", "dbt_column", "lookml_view"}) + + self.assertIn("FROM agents.dbt_columns", views["columns"]) + self.assertNotIn("FROM agents.dbt_column c", views["columns"]) + + def 
test_builds_metric_view_from_provider_metric_views(self): + views = build_context_view_sql({"lookml_measure", "osi_metric"}) + + self.assertIn("FROM agents.lookml_metrics", views["metrics"]) + self.assertIn("FROM agents.osi_metrics", views["metrics"]) + self.assertIn("FROM agents.lookml_measure", views["lookml_metrics"]) + self.assertIn("FROM agents.osi_metric", views["osi_metrics"]) + + def test_dbt_relationships_use_model_names_for_table_endpoints(self): + views = build_context_view_sql({"dbt_model", "dbt_dependency"}) + + self.assertIn("JOIN agents.dbt_model upstream ON upstream.unique_id = d.upstream_id", views["dbt_relationships"]) + self.assertIn("JOIN agents.dbt_model downstream ON downstream.unique_id = d.downstream_id", views["dbt_relationships"]) + self.assertIn("upstream.name AS from_table", views["dbt_relationships"]) + self.assertIn("downstream.name AS to_table", views["dbt_relationships"]) + + def test_osi_relationships_use_dataset_source_tables_for_table_endpoints(self): + views = build_context_view_sql({"osi_dataset", "osi_relationship"}) + + self.assertIn("JOIN agents.osi_dataset from_dataset ON from_dataset.name = r.from_dataset", views["osi_relationships"]) + self.assertIn("JOIN agents.osi_dataset to_dataset ON to_dataset.name = r.to_dataset", views["osi_relationships"]) + self.assertIn("from_dataset.source_table AS from_table", views["osi_relationships"]) + self.assertIn("to_dataset.source_table AS to_table", views["osi_relationships"]) + + def test_osi_tables_parse_source_table_for_table_merge_identity(self): + views = build_context_view_sql({"osi_dataset"}) + + self.assertIn("REGEXP_COUNT(source_table, '[.]') = 2", views["osi_tables"]) + self.assertIn("THEN SPLIT_PART(source_table, '.', 1)", views["osi_tables"]) + self.assertIn("END AS table_catalog", views["osi_tables"]) + self.assertIn("END AS table_schema", views["osi_tables"]) + self.assertIn("THEN SPLIT_PART(source_table, '.', 3)", views["osi_tables"]) + self.assertIn("END AS table_name", 
views["osi_tables"]) + + def test_lookml_tables_parse_simple_sql_table_name(self): + views = build_context_view_sql({"lookml_view"}) + + self.assertIn("REGEXP_COUNT(sql_table_name, '[.]') = 2", views["lookml_tables"]) + self.assertIn("THEN SPLIT_PART(sql_table_name, '.', 1)", views["lookml_tables"]) + self.assertIn("END AS table_catalog", views["lookml_tables"]) + self.assertIn("END AS table_schema", views["lookml_tables"]) + self.assertIn("THEN SPLIT_PART(sql_table_name, '.', 3)", views["lookml_tables"]) + self.assertIn("END AS table_name", views["lookml_tables"]) + self.assertIn("ELSE name", views["lookml_tables"]) + + def test_lookml_columns_use_same_sql_table_name_identity_as_tables(self): + views = build_context_view_sql({"lookml_view", "lookml_dimension"}) + + self.assertIn("FROM agents.lookml_dimension d", views["lookml_columns"]) + self.assertIn("JOIN agents.lookml_view v ON v.name = d.view_name", views["lookml_columns"]) + self.assertIn("REGEXP_COUNT(v.sql_table_name, '[.]') = 2", views["lookml_columns"]) + self.assertIn("THEN SPLIT_PART(v.sql_table_name, '.', 3)", views["lookml_columns"]) + self.assertIn("ELSE v.name", views["lookml_columns"]) + + def test_empty_view_has_typed_projection(self): + views = build_context_view_sql(set()) + + self.assertIn("CAST(NULL AS VARCHAR) AS table_name", views["dbt_tables"]) + self.assertIn("WHERE 1 = 0", views["dbt_tables"]) + self.assertIn("FROM information_schema.tables t", views["tables"]) + self.assertIn("CAST(NULL AS VARCHAR) AS entity_id", views["entities"]) + + +if __name__ == "__main__": + unittest.main() From 6c9737799fb0d514f11cec980dc15503a9930b30 Mon Sep 17 00:00:00 2001 From: Dave Fowler Date: Sat, 30 May 2026 23:12:01 -0700 Subject: [PATCH 2/3] Narrow context views to v1: TABLES + COLUMNS, generic identity merge - scope to the surfaces information_schema already has (TABLES, COLUMNS); drop RELATIONSHIPS/METRICS/ENTITIES and their provider views (deferred; relationships should later extend 
REFERENTIAL_CONSTRAINTS/KEY_COLUMN_USAGE, not a custom view) - AGENTS.TABLES/COLUMNS use SELECT t.* over information_schema as the spine (no hardcoded native column list; inherits whatever the account exposes) - generic identity merge: left join every discovered {provider}_tables/_columns view, enrichment columns appended under a _ prefix, aggregated to one row per identity to prevent fanout; no hardcoded providers - remove hardcoded memories_count/warnings_count; memory participates later by publishing its own *_TABLES/*_COLUMNS view and is picked up automatically - view creation is fail-soft (warn, never break ingestion) - align osi_columns identity parsing with osi_tables; rename helper to _relation_identity_sql - update SPEC, proposal (v1 scope + resolved decisions), root entries, and tests Co-Authored-By: Claude Opus 4.8 --- SPEC.md | 110 ++------ proposals/agent-schema-views.md | 28 +- src/agents_schema/root.py | 11 +- src/agents_schema/views.py | 445 ++++++++++---------------------- tests/test_root.py | 6 +- tests/test_views.py | 119 ++++----- 6 files changed, 242 insertions(+), 477 deletions(-) diff --git a/SPEC.md b/SPEC.md index cd1fb2f..22b683b 100644 --- a/SPEC.md +++ b/SPEC.md @@ -89,108 +89,55 @@ The generic views are documented in `AGENTS.ROOT` under the `core` provider. | View | Purpose | |---|---| | `AGENTS.TABLES` | `INFORMATION_SCHEMA.TABLES` enriched with matching provider table context. | -| `AGENTS.COLUMNS` | Column/field-like analytical objects from provider tables. | -| `AGENTS.RELATIONSHIPS` | Lineage and semantic relationship edges from provider tables. | -| `AGENTS.METRICS` | Metric/measure-like semantic objects from provider tables. | -| `AGENTS.ENTITIES` | Reserved entity-oriented view; empty until a provider contributes entity metadata. | +| `AGENTS.COLUMNS` | `INFORMATION_SCHEMA.COLUMNS` enriched with matching provider column context. 
| --- ## Generic Context Views -The generic views are compatibility-oriented: the leading columns mirror common information-schema concepts where possible, and later columns add provider-derived context that has cross-provider meaning. Provider-normalized views are staging inputs, not new sources of truth. +### Scope -Provider-owned normalized views feed the generic views: +v1 extends the surfaces `INFORMATION_SCHEMA` already has — `TABLES` and `COLUMNS` — rather than inventing new object types. Relationships, metrics, and entities are intentionally out of scope: the information-schema-faithful home for relationships is the `REFERENTIAL_CONSTRAINTS` / `KEY_COLUMN_USAGE` family (a future extension), and metrics/entities are object types that semantic providers such as OSI already model in their own `AGENTS.OSI_*` tables. The generic views enrich; they do not become a competing semantic model. -| Generic view | Provider-normalized inputs | -|---|---| -| `AGENTS.TABLES` | `INFORMATION_SCHEMA.TABLES` left joined to `AGENTS.DBT_TABLES`, `AGENTS.LOOKML_TABLES`, `AGENTS.OSI_TABLES` | -| `AGENTS.COLUMNS` | `AGENTS.DBT_COLUMNS`, `AGENTS.LOOKML_COLUMNS`, `AGENTS.OSI_COLUMNS` | -| `AGENTS.RELATIONSHIPS` | `AGENTS.DBT_RELATIONSHIPS`, `AGENTS.OSI_RELATIONSHIPS` | -| `AGENTS.METRICS` | `AGENTS.LOOKML_METRICS`, `AGENTS.OSI_METRICS` | +### Merge model + +Each provider publishes a normalized `AGENTS._TABLES` / `AGENTS._COLUMNS` view with a shared shape. The generic views then take the native `INFORMATION_SCHEMA` view as the row spine via `SELECT t.*` (so they inherit whatever native columns the account exposes — nothing is hardcoded) and **left join every provider view that exists** by object identity: + +- `AGENTS.TABLES`: `INFORMATION_SCHEMA.TABLES` joined to each `*_TABLES` view on `table_catalog` / `table_schema` / `table_name`. +- `AGENTS.COLUMNS`: `INFORMATION_SCHEMA.COLUMNS` joined to each `*_COLUMNS` view on `table_catalog` / `table_schema` / `table_name` / `column_name`. 
+ +The merge is generic and provider-agnostic. Each provider's enrichment columns are appended under a `_` prefix (`dbt_description`, `lookml_ai_context`, `osi_description`, …), so providers never collide and no native column is overwritten. Within a single provider, rows are aggregated to one row per object identity before the join, so duplicate provider rows cannot multiply native rows. A provider that ships a new `*_TABLES`/`*_COLUMNS` view later — for example a memory provider contributing `memory_*` counts — is picked up automatically with no change to the core views. -A provider participates in `AGENTS.TABLES` by publishing a matching provider-normalized `*_TABLES` view. `AGENTS.TABLES` uses `INFORMATION_SCHEMA.TABLES` as its row spine and left joins every provider-normalized table view by `table_catalog`, `table_schema`, and `table_name`. Provider-specific detail stays in source tables such as `AGENTS.LOOKML_DIMENSION`; the merged view exposes provider-prefixed context columns for the common fields worth carrying forward. +`SELECT t.*` resolves against the `INFORMATION_SCHEMA` of the database that holds the `AGENTS` schema, so `AGENTS.TABLES`/`COLUMNS` cover objects in that database. Provider-specific detail not promoted into the shared shape stays in the source tables (for example `AGENTS.LOOKML_DIMENSION`) and is reachable through the `_source_object_id` columns. ### `AGENTS.TABLES` -```sql -CREATE OR REPLACE VIEW AGENTS.TABLES AS ... -``` +`SELECT t.*` from `INFORMATION_SCHEMA.TABLES` plus, for each participating provider, the following prefixed columns: | Column | Description | |---|---| -| `table_catalog` through `comment` | Native columns from Snowflake `INFORMATION_SCHEMA.TABLES`. | -| `_display_name` | Provider label for the matched table, such as `dbt_display_name`. | +| `_table_type` | Provider object kind, such as `DBT_MODEL` or `OSI_DATASET`. | +| `_display_name` | Provider label for the matched table. 
| | `_description` | Provider description for the matched table. | | `_ai_context` | Provider AI context for the matched table. | -| `_source_object_id` | Provider-specific object identifier. | +| `_source_object_id` | Provider-specific object identifier(s). | | `_source_path` | Source file path when available. | | `_materialization` | Provider materialization when available. | | `_tags` | Provider tags when available. | -| `memories_count` | Sum of provider memory counts; reserved for memory-provider integration. | -| `warnings_count` | Sum of provider warning counts; reserved for memory-provider integration. | ### `AGENTS.COLUMNS` -| Column | Description | -|---|---| -| `table_catalog` | Catalog/database name when known. | -| `table_schema` | Schema name when known. | -| `table_name` | Parent table-like object name. | -| `column_name` | Column/field-like object name. | -| `ordinal_position` | Ordinal position when known. | -| `data_type` | Provider data type when known. | -| `is_nullable` | Nullability when known. | -| `display_name` | Human-facing label when available. | -| `description` | Provider description. | -| `ai_context` | Provider AI context when available. | -| `semantic_type` | Provider semantic field kind when available. | -| `is_time_dimension` | Whether the field is marked as time-like. | -| `expression` | Provider expression or SQL when available. | -| `source_provider` | Provider that contributed the row. | -| `source_object_id` | Provider-specific object identifier. | -| `memories_count` | Reserved for memory-provider integration. | -| `warnings_count` | Reserved for memory-provider integration. | - -### `AGENTS.RELATIONSHIPS` - -| Column | Description | -|---|---| -| `relationship_name` | Relationship or lineage edge name. | -| `from_catalog` | Source catalog/database when known. | -| `from_schema` | Source schema when known. | -| `from_table` | Source table/object. | -| `from_column` | Source column(s) when known. 
| -| `to_catalog` | Destination catalog/database when known. | -| `to_schema` | Destination schema when known. | -| `to_table` | Destination table/object. | -| `to_column` | Destination column(s) when known. | -| `relationship_type` | Relationship type, such as `lineage` or `semantic_relationship`. | -| `multiplicity` | Multiplicity when known. | -| `source_provider` | Provider that contributed the row. | -| `source_object_id` | Provider-specific object identifier. | -| `memories_count` | Reserved for memory-provider integration. | -| `warnings_count` | Reserved for memory-provider integration. | - -### `AGENTS.METRICS` +`SELECT t.*` from `INFORMATION_SCHEMA.COLUMNS` plus, for each participating provider, the following prefixed columns: | Column | Description | |---|---| -| `metric_name` | Metric or measure name. | -| `display_name` | Human-facing label when available. | -| `description` | Provider description. | -| `ai_context` | Provider AI context when available. | -| `expression` | Metric expression or SQL when available. | -| `source_provider` | Provider that contributed the row. | -| `source_object_id` | Provider-specific object identifier. | -| `dataset_name` | Parent dataset when available. | -| `view_name` | Parent LookML view when available. | -| `memories_count` | Reserved for memory-provider integration. | -| `warnings_count` | Reserved for memory-provider integration. | - -### `AGENTS.ENTITIES` - -`AGENTS.ENTITIES` is currently an empty typed view reserved for providers that contribute canonical entity metadata in a future release. +| `_display_name` | Provider label for the matched column. | +| `_description` | Provider description. | +| `_ai_context` | Provider AI context when available. | +| `_semantic_type` | Provider semantic field kind when available. | +| `_is_time_dimension` | Whether the field is marked time-like. | +| `_expression` | Provider expression or SQL when available. | +| `_source_object_id` | Provider-specific object identifier. 
| --- @@ -560,7 +507,7 @@ The current core provider name is: | Provider | Objects | |---|---| -| `core` | `AGENTS.ROOT`, `AGENTS.TABLES`, `AGENTS.COLUMNS`, `AGENTS.RELATIONSHIPS`, `AGENTS.METRICS`, `AGENTS.ENTITIES` | +| `core` | `AGENTS.ROOT`, `AGENTS.TABLES`, `AGENTS.COLUMNS` | --- @@ -569,11 +516,8 @@ The current core provider name is: | Table | Source | Purpose | |---|---|---| | `AGENTS.ROOT` | core | Provider registry upserted by dbt, LookML, and OSI workflows | -| `AGENTS.TABLES` | core | Generic table/object context view | -| `AGENTS.COLUMNS` | core | Generic column/field context view | -| `AGENTS.RELATIONSHIPS` | core | Generic relationship and lineage context view | -| `AGENTS.METRICS` | core | Generic metric and measure context view | -| `AGENTS.ENTITIES` | core | Reserved generic entity context view | +| `AGENTS.TABLES` | core | `INFORMATION_SCHEMA.TABLES` enriched from provider `*_TABLES` views | +| `AGENTS.COLUMNS` | core | `INFORMATION_SCHEMA.COLUMNS` enriched from provider `*_COLUMNS` views | | `AGENTS.DBT_MODEL` | dbt | dbt models with schema, materialization, documentation, path, and tags | | `AGENTS.DBT_COLUMN` | dbt | Documented dbt model columns | | `AGENTS.DBT_DEPENDENCY` | dbt | Direct dbt dependency edges | diff --git a/proposals/agent-schema-views.md b/proposals/agent-schema-views.md index 751b679..39bf05a 100644 --- a/proposals/agent-schema-views.md +++ b/proposals/agent-schema-views.md @@ -15,7 +15,17 @@ AGENTS.METRICS AGENTS.ENTITIES ``` -The goal is to make Agents Schema instantly swappable for common `INFORMATION_SCHEMA` exploration patterns while adding richer context. Anywhere an agent would normally ask `INFORMATION_SCHEMA.TABLES` or `INFORMATION_SCHEMA.COLUMNS`, it should be able to ask `AGENTS.TABLES` or `AGENTS.COLUMNS` instead and get the familiar shape plus dbt descriptions, LookML/OSI semantic metadata, memory counts, warnings, source provider references, and eventually profiling or usage context. 
+The goal is to make Agents Schema instantly swappable for common `INFORMATION_SCHEMA` exploration patterns while adding richer context. Anywhere an agent would normally ask `INFORMATION_SCHEMA.TABLES` or `INFORMATION_SCHEMA.COLUMNS`, it should be able to ask `AGENTS.TABLES` or `AGENTS.COLUMNS` instead and get the familiar shape plus dbt descriptions, LookML/OSI semantic metadata, source provider references, and eventually profiling or usage context. + +## v1 Scope (Implemented) + +The shipped v1 is deliberately narrower than the full proposal below, which is retained as the longer-term design sketch. + +- **Only the surfaces `INFORMATION_SCHEMA` already has — `AGENTS.TABLES` and `AGENTS.COLUMNS`.** `RELATIONSHIPS`, `METRICS`, and `ENTITIES` are deferred. They are *new* object types that semantic providers like OSI already model in their own tables; adding generic versions now would make this a competing semantic model rather than an information-schema extension. The information-schema-faithful home for relationships is the `REFERENTIAL_CONSTRAINTS` / `KEY_COLUMN_USAGE` family, which is the intended future shape rather than a custom `AGENTS.RELATIONSHIPS` view. +- **Native spine via `SELECT t.*`.** `AGENTS.TABLES`/`COLUMNS` select `t.*` from `INFORMATION_SCHEMA.TABLES`/`COLUMNS` and inherit whatever native columns the account exposes. No native column list is hardcoded. +- **Generic identity merge.** Each provider's `*_TABLES`/`*_COLUMNS` view is left joined by object identity (catalog/schema/table, plus column for columns), with its enrichment columns appended under a `_` prefix. The set of providers is discovered, not hardcoded. Within a provider, rows are aggregated to one per identity to prevent fanout. +- **No hardcoded memory counts.** Memory participation is purely additive: when a memory provider later publishes its own `*_TABLES`/`*_COLUMNS` view exposing counts, those columns appear automatically. The core views contain no memory-specific logic. 
+- **Fail-soft.** View creation runs at the end of each provider ingestion but never fails the ingestion; a view that cannot be created is skipped with a warning. ## Motivation @@ -350,10 +360,16 @@ Recommended first version: Later versions can add canonicalization if Agents Schema gains stable warehouse object identifiers. +## Resolved Decisions (v1) + +- **Warehouse views, refreshed per ingestion** (fail-soft), not CLI-materialized tables. +- **Native objects are the spine.** `TABLES`/`COLUMNS` start from `INFORMATION_SCHEMA` and enrich; they are not provider-only unions. +- **Memory counts are omitted until the memory provider ships its own view.** No reserved-but-zero columns. +- **Measures live in the deferred `METRICS` surface, not `COLUMNS`.** v1 columns are physical/field-like only. +- **dbt, LookML, and OSI all participate in v1** (LookML/OSI `sql_table_name`/`source_table` are parsed into identity). + ## Open Questions -- Should views be materialized by the CLI or created as warehouse views? -- Should the first implementation include only dbt/OSI and leave LookML table parsing out? -- Should `TABLES`/`COLUMNS` include native warehouse objects, or only objects contributed by source providers? -- Should memory counts require the memory provider, or should views omit those columns until memory ships? -- Should `AGENTS.COLUMNS` include measures, or should measures live only in `AGENTS.METRICS`? +- When relationships land, confirm the `REFERENTIAL_CONSTRAINTS` / `KEY_COLUMN_USAGE` shape over a custom view, including how unenforced/OSI relationships are represented when the native constraint views are empty. +- Cross-database coverage: `INFORMATION_SCHEMA` is per-database, so `AGENTS.TABLES`/`COLUMNS` only cover the database holding `AGENTS`. Should multi-database deployments use `SNOWFLAKE.ACCOUNT_USAGE` (account-wide, but with data latency) as an alternate spine? 
+- Should provider enrichment be prefixed columns (current) or also offer a coalesced single `description`/`ai_context` with a trust order? diff --git a/src/agents_schema/root.py b/src/agents_schema/root.py index 9fe2ab7..9677452 100644 --- a/src/agents_schema/root.py +++ b/src/agents_schema/root.py @@ -19,11 +19,8 @@ "core": ( ("overview", "# Core\nShared Agents Schema registry and generic context views."), ("root", "Provider registry. See AGENTS.ROOT."), - ("tables", "Information-schema-like table/object context view enriched from provider *_TABLES views."), - ("columns", "Information-schema-like column/field context view unioned from provider *_COLUMNS views."), - ("relationships", "Relationship and lineage context view unioned from provider *_RELATIONSHIPS views."), - ("metrics", "Metric and measure context view unioned from provider *_METRICS views."), - ("entities", "Entity context view. Reserved until a provider contributes entity metadata. See AGENTS.ENTITIES."), + ("tables", "Information-schema-like table context view: information_schema.tables enriched from provider *_TABLES views. See AGENTS.TABLES."), + ("columns", "Information-schema-like column context view: information_schema.columns enriched from provider *_COLUMNS views. See AGENTS.COLUMNS."), ), "dbt": ( ("overview", "# dbt\nTransformation metadata from dbt manifest.json."), @@ -32,7 +29,6 @@ ("dependency", "Direct dbt DAG edges. See AGENTS.DBT_DEPENDENCY."), ("tables", "Provider-normalized table context view. See AGENTS.DBT_TABLES."), ("columns", "Provider-normalized column context view. See AGENTS.DBT_COLUMNS."), - ("relationships", "Provider-normalized relationship context view. See AGENTS.DBT_RELATIONSHIPS."), ), "lookml": ( ("overview", "# LookML\nSemantic metadata parsed from LookML files."), @@ -42,7 +38,6 @@ ("explore", "One row per LookML explore. See AGENTS.LOOKML_EXPLORE."), ("tables", "Provider-normalized table context view. 
See AGENTS.LOOKML_TABLES."), ("columns", "Provider-normalized column context view. See AGENTS.LOOKML_COLUMNS."), - ("metrics", "Provider-normalized metric context view. See AGENTS.LOOKML_METRICS."), ), "osi": ( ("overview", "# OSI\nOpen Semantic Interchange metadata parsed from *.osi.yaml files."), @@ -52,8 +47,6 @@ ("relationship", "One row per OSI relationship. See AGENTS.OSI_RELATIONSHIP."), ("tables", "Provider-normalized table context view. See AGENTS.OSI_TABLES."), ("columns", "Provider-normalized column context view. See AGENTS.OSI_COLUMNS."), - ("relationships", "Provider-normalized relationship context view. See AGENTS.OSI_RELATIONSHIPS."), - ("metrics", "Provider-normalized metric context view. See AGENTS.OSI_METRICS."), ), } diff --git a/src/agents_schema/views.py b/src/agents_schema/views.py index 918161c..7d2e9f9 100644 --- a/src/agents_schema/views.py +++ b/src/agents_schema/views.py @@ -1,6 +1,26 @@ -"""Information-schema-like context views over provider-normalized views.""" +"""Information-schema-like context views over provider-normalized views. + +v1 scope: extend the surfaces ``INFORMATION_SCHEMA`` already has — `TABLES` and +`COLUMNS` — rather than inventing new object types. Each metadata provider +publishes a normalized ``AGENTS.<PROVIDER>_TABLES`` / ``AGENTS.<PROVIDER>_COLUMNS`` +view with a shared shape. ``AGENTS.TABLES`` and ``AGENTS.COLUMNS`` then take the +native ``INFORMATION_SCHEMA`` view as the row spine (``SELECT t.*``) and merge +**every** provider view that exists by object identity, appending each +provider's columns under a ``<provider>_`` prefix. + +The merge is generic: no native column list is hardcoded (``SELECT t.*`` inherits +whatever the account exposes), and no provider is special-cased. A provider that +ships a new ``*_TABLES`` view later — e.g. a memory provider contributing +``memories_count`` — is picked up automatically with no change here. + +Relationships and metrics are intentionally out of scope for v1. 
The +information-schema-faithful home for relationships is the +``REFERENTIAL_CONSTRAINTS`` / ``KEY_COLUMN_USAGE`` family; see the proposal. +""" from __future__ import annotations +import sys + from .destinations import Destination from .root import upsert_provider_root @@ -11,40 +31,40 @@ "build_context_view_sql", ] -CORE_VIEW_NAMES = frozenset({"tables", "columns", "relationships", "metrics", "entities"}) +CORE_VIEW_NAMES = frozenset({"tables", "columns"}) PROVIDER_VIEW_NAMES = frozenset( { "dbt_tables", "dbt_columns", - "dbt_relationships", "lookml_tables", "lookml_columns", - "lookml_metrics", "osi_tables", "osi_columns", - "osi_relationships", - "osi_metrics", } ) -_LOOKML_RELATION_RE = r"^[A-Za-z_][A-Za-z0-9_$]*([.][A-Za-z_][A-Za-z0-9_$]*){0,2}$" +_RELATION_RE = r"^[A-Za-z_][A-Za-z0-9_$]*([.][A-Za-z_][A-Za-z0-9_$]*){0,2}$" def create_context_views(dest: Destination) -> None: - """Create provider-normalized views and generic context views.""" + """Create provider-normalized views and the generic context views. + + Fail-soft: a view that cannot be created warns but never breaks the + surrounding ingestion, which has already written the provider tables. 
+ """ upsert_provider_root(dest, "core") for name, sql in build_context_view_sql(dest.existing_table_names()).items(): - dest.replace_view(name, sql) + try: + dest.replace_view(name, sql) + except Exception as e: # noqa: BLE001 - the view layer must not fail ingestion + print(f" warning: could not create view agents.{name}: {e}", file=sys.stderr) def build_context_view_sql(existing_tables: set[str]) -> dict[str, str]: existing = {name.lower() for name in existing_tables} provider_views = _provider_view_sql(existing) return provider_views | { - "tables": _merge_table_views(provider_views), - "columns": _union_provider_views(provider_views, "columns", _COLUMN_COLUMNS), - "relationships": _union_provider_views(provider_views, "relationships", _RELATIONSHIP_COLUMNS), - "metrics": _union_provider_views(provider_views, "metrics", _METRIC_COLUMNS), - "entities": _union_provider_views(provider_views, "entities", _ENTITY_COLUMNS), + "tables": _merge_view(provider_views, "tables", "information_schema.tables", _TABLE_IDENTITY, _TABLE_MERGE), + "columns": _merge_view(provider_views, "columns", "information_schema.columns", _COLUMN_IDENTITY, _COLUMN_MERGE), } @@ -52,159 +72,104 @@ def _provider_view_sql(existing: set[str]) -> dict[str, str]: return { "dbt_tables": _dbt_tables_sql(existing), "dbt_columns": _dbt_columns_sql(existing), - "dbt_relationships": _dbt_relationships_sql(existing), "lookml_tables": _lookml_tables_sql(existing), "lookml_columns": _lookml_columns_sql(existing), - "lookml_metrics": _lookml_metrics_sql(existing), "osi_tables": _osi_tables_sql(existing), "osi_columns": _osi_columns_sql(existing), - "osi_relationships": _osi_relationships_sql(existing), - "osi_metrics": _osi_metrics_sql(existing), } -def _union_provider_views(provider_views: dict[str, str], suffix: str, columns: list[tuple[str, str]]) -> str: +# --- generic merge over the native information_schema spine ------------------ + + +def _merge_view( + provider_views: dict[str, str], + suffix: str, + 
spine: str, + identity: tuple[str, ...], + merge_columns: tuple[str, ...], +) -> str: + views = [name for name in provider_views if name.endswith(f"_{suffix}")] selects = [ - f"SELECT {', '.join(name for name, _ in columns)}\nFROM agents.{view_name}" - for view_name in provider_views - if view_name.endswith(f"_{suffix}") + ",\n ".join(f"{alias}.{column} AS {alias}_{column}" for column in merge_columns) + for alias in (_provider_alias(name, suffix) for name in views) ] - return _union_or_empty(selects, columns) + joins = "\n".join(_merge_join(name, _provider_alias(name, suffix), identity, merge_columns) for name in views) + enrichment = (",\n " + ",\n ".join(selects)) if selects else "" + return f"SELECT\n t.*{enrichment}\nFROM {spine} t\n{joins}" -def _merge_table_views(provider_views: dict[str, str]) -> str: - table_views = [view_name for view_name in provider_views if view_name.endswith("_tables")] - provider_selects = [ - _provider_table_select(alias) - for alias in (_provider_alias(view_name) for view_name in table_views) - ] - count_selects = [ - _provider_table_count_select(alias, "memories_count") - for alias in (_provider_alias(view_name) for view_name in table_views) - ] - warning_selects = [ - _provider_table_count_select(alias, "warnings_count") - for alias in (_provider_alias(view_name) for view_name in table_views) - ] - joins = "\n".join( - _provider_table_join(view_name, _provider_alias(view_name)) - for view_name in table_views +def _provider_alias(view_name: str, suffix: str) -> str: + return view_name.removesuffix(f"_{suffix}") + + +def _merge_join(view_name: str, alias: str, identity: tuple[str, ...], merge_columns: tuple[str, ...]) -> str: + id_select = ",\n ".join(identity) + agg_select = ",\n ".join(f"{_agg(column)} AS {column}" for column in merge_columns) + group_by = ", ".join(identity) + required = [column for column in identity if column not in ("table_catalog", "table_schema")] + where = " AND ".join(f"{column} IS NOT NULL" for column in 
required) + on = "\n AND ".join(_merge_on(alias, column) for column in identity) + return ( + f"LEFT JOIN (\n" + f" SELECT\n {id_select},\n {agg_select}\n" + f" FROM agents.{view_name}\n" + f" WHERE {where}\n" + f" GROUP BY {group_by}\n" + f") {alias}\n ON {on}" ) - return f"""SELECT - t.table_catalog, - t.table_schema, - t.table_name, - t.table_owner, - t.table_type, - t.is_transient, - t.clustering_key, - t.row_count, - t.bytes, - t.retention_time, - t.self_referencing_column_name, - t.reference_generation, - t.user_defined_type_catalog, - t.user_defined_type_schema, - t.user_defined_type_name, - t.is_insertable_into, - t.is_typed, - t.commit_action, - t.created, - t.last_altered, - t.last_ddl, - t.last_ddl_by, - t.auto_clustering_on, - t.comment, - t.is_temporary, - t.is_iceberg, - t.is_dynamic, - t.is_immutable, - t.is_hybrid, - {",\n ".join(provider_selects)}, - {" + ".join(count_selects)} AS memories_count, - {" + ".join(warning_selects)} AS warnings_count -FROM information_schema.tables t -{joins}""" - - -def _provider_alias(view_name: str) -> str: - return view_name.removesuffix("_tables") - - -def _provider_table_select(alias: str) -> str: - return f"""{alias}.display_name AS {alias}_display_name, - {alias}.description AS {alias}_description, - {alias}.ai_context AS {alias}_ai_context, - {alias}.source_object_id AS {alias}_source_object_id, - {alias}.source_path AS {alias}_source_path, - {alias}.materialization AS {alias}_materialization, - {alias}.tags AS {alias}_tags""" - - -def _provider_table_count_select(alias: str, column: str) -> str: - return f"COALESCE({alias}.{column}, 0)" - - -def _provider_table_join(view_name: str, alias: str) -> str: - return f"""LEFT JOIN ( - SELECT - table_catalog, - table_schema, - table_name, - MIN(display_name) AS display_name, - MIN(description) AS description, - MIN(ai_context) AS ai_context, - LISTAGG(source_object_id, ', ') WITHIN GROUP (ORDER BY source_object_id) AS source_object_id, - LISTAGG(source_path, ', ') 
WITHIN GROUP (ORDER BY source_path) AS source_path, - MIN(materialization) AS materialization, - ANY_VALUE(tags) AS tags, - SUM(memories_count) AS memories_count, - SUM(warnings_count) AS warnings_count - FROM agents.{view_name} - WHERE table_schema IS NOT NULL - AND table_name IS NOT NULL - GROUP BY table_catalog, table_schema, table_name -) {alias} - ON ({alias}.table_catalog IS NULL OR LOWER(t.table_catalog) = LOWER({alias}.table_catalog)) - AND LOWER(t.table_schema) = LOWER({alias}.table_schema) - AND LOWER(t.table_name) = LOWER({alias}.table_name)""" - - -def _union_or_empty(selects: list[str], columns: list[tuple[str, str]]) -> str: - if selects: - return "\nUNION ALL\n".join(selects) + + +def _merge_on(alias: str, column: str) -> str: + if column == "table_catalog": + return f"({alias}.{column} IS NULL OR LOWER(t.{column}) = LOWER({alias}.{column}))" + return f"LOWER(t.{column}) = LOWER({alias}.{column})" + + +def _agg(column: str) -> str: + if column == "tags": + return f"ANY_VALUE({column})" + if column in ("source_object_id", "source_path"): + return f"LISTAGG({column}, ', ') WITHIN GROUP (ORDER BY {column})" + return f"MIN({column})" + + +def _empty_view(columns: list[tuple[str, str]]) -> str: projection = ",\n ".join(f"CAST(NULL AS {kind}) AS {name}" for name, kind in columns) return f"SELECT\n {projection}\nWHERE 1 = 0" -def _lookml_relation_identity_sql(sql_table_name: str, fallback_name: str) -> tuple[str, str, str]: - relation_is_simple = f"REGEXP_LIKE({sql_table_name}, '{_LOOKML_RELATION_RE}')" - part_count = f"REGEXP_COUNT({sql_table_name}, '[.]')" +def _relation_identity_sql(relation: str, fallback_name: str) -> tuple[str, str, str]: + """Split a 1-, 2-, or 3-part relation reference into catalog/schema/table.""" + is_simple = f"REGEXP_LIKE({relation}, '{_RELATION_RE}')" + part_count = f"REGEXP_COUNT({relation}, '[.]')" return ( f"""CASE - WHEN {relation_is_simple} AND {part_count} = 2 - THEN SPLIT_PART({sql_table_name}, '.', 1) + WHEN {is_simple} 
AND {part_count} = 2 + THEN SPLIT_PART({relation}, '.', 1) ELSE CAST(NULL AS VARCHAR) END AS table_catalog""", f"""CASE - WHEN {relation_is_simple} AND {part_count} = 2 - THEN SPLIT_PART({sql_table_name}, '.', 2) - WHEN {relation_is_simple} AND {part_count} = 1 - THEN SPLIT_PART({sql_table_name}, '.', 1) + WHEN {is_simple} AND {part_count} = 2 + THEN SPLIT_PART({relation}, '.', 2) + WHEN {is_simple} AND {part_count} = 1 + THEN SPLIT_PART({relation}, '.', 1) ELSE CAST(NULL AS VARCHAR) END AS table_schema""", f"""CASE - WHEN {relation_is_simple} AND {part_count} = 2 - THEN SPLIT_PART({sql_table_name}, '.', 3) - WHEN {relation_is_simple} AND {part_count} = 1 - THEN SPLIT_PART({sql_table_name}, '.', 2) - WHEN {relation_is_simple} AND {part_count} = 0 - THEN {sql_table_name} + WHEN {is_simple} AND {part_count} = 2 + THEN SPLIT_PART({relation}, '.', 3) + WHEN {is_simple} AND {part_count} = 1 + THEN SPLIT_PART({relation}, '.', 2) + WHEN {is_simple} AND {part_count} = 0 + THEN {relation} ELSE {fallback_name} END AS table_name""", ) +# --- provider-normalized view shapes ----------------------------------------- + _TABLE_COLUMNS = [ ("table_catalog", "VARCHAR"), ("table_schema", "VARCHAR"), @@ -218,18 +183,17 @@ def _lookml_relation_identity_sql(sql_table_name: str, fallback_name: str) -> tu ("source_path", "VARCHAR"), ("materialization", "VARCHAR"), ("tags", "VARIANT"), - ("memories_count", "NUMBER"), - ("warnings_count", "NUMBER"), ] +_TABLE_IDENTITY = ("table_catalog", "table_schema", "table_name") +_TABLE_MERGE = tuple( + name for name, _ in _TABLE_COLUMNS if name not in _TABLE_IDENTITY and name != "source_provider" +) _COLUMN_COLUMNS = [ ("table_catalog", "VARCHAR"), ("table_schema", "VARCHAR"), ("table_name", "VARCHAR"), ("column_name", "VARCHAR"), - ("ordinal_position", "NUMBER"), - ("data_type", "VARCHAR"), - ("is_nullable", "BOOLEAN"), ("display_name", "VARCHAR"), ("description", "TEXT"), ("ai_context", "TEXT"), @@ -238,59 +202,16 @@ def 
_lookml_relation_identity_sql(sql_table_name: str, fallback_name: str) -> tu ("expression", "TEXT"), ("source_provider", "VARCHAR"), ("source_object_id", "VARCHAR"), - ("memories_count", "NUMBER"), - ("warnings_count", "NUMBER"), -] - -_RELATIONSHIP_COLUMNS = [ - ("relationship_name", "VARCHAR"), - ("from_catalog", "VARCHAR"), - ("from_schema", "VARCHAR"), - ("from_table", "VARCHAR"), - ("from_column", "VARCHAR"), - ("to_catalog", "VARCHAR"), - ("to_schema", "VARCHAR"), - ("to_table", "VARCHAR"), - ("to_column", "VARCHAR"), - ("relationship_type", "VARCHAR"), - ("multiplicity", "VARCHAR"), - ("source_provider", "VARCHAR"), - ("source_object_id", "VARCHAR"), - ("memories_count", "NUMBER"), - ("warnings_count", "NUMBER"), -] - -_METRIC_COLUMNS = [ - ("metric_name", "VARCHAR"), - ("display_name", "VARCHAR"), - ("description", "TEXT"), - ("ai_context", "TEXT"), - ("expression", "TEXT"), - ("source_provider", "VARCHAR"), - ("source_object_id", "VARCHAR"), - ("dataset_name", "VARCHAR"), - ("view_name", "VARCHAR"), - ("memories_count", "NUMBER"), - ("warnings_count", "NUMBER"), -] - -_ENTITY_COLUMNS = [ - ("entity_id", "VARCHAR"), - ("display_name", "VARCHAR"), - ("description", "TEXT"), - ("source_provider", "VARCHAR"), - ("source_object_id", "VARCHAR"), - ("primary_table_schema", "VARCHAR"), - ("primary_table_name", "VARCHAR"), - ("primary_key_columns", "VARIANT"), - ("memories_count", "NUMBER"), - ("warnings_count", "NUMBER"), ] +_COLUMN_IDENTITY = ("table_catalog", "table_schema", "table_name", "column_name") +_COLUMN_MERGE = tuple( + name for name, _ in _COLUMN_COLUMNS if name not in _COLUMN_IDENTITY and name != "source_provider" +) def _dbt_tables_sql(existing: set[str]) -> str: if "dbt_model" not in existing: - return _union_or_empty([], _TABLE_COLUMNS) + return _empty_view(_TABLE_COLUMNS) return """SELECT CAST(NULL AS VARCHAR) AS table_catalog, schema_name AS table_schema, @@ -303,23 +224,18 @@ def _dbt_tables_sql(existing: set[str]) -> str: unique_id AS 
source_object_id, file_path AS source_path, materialization, - tags, - 0 AS memories_count, - 0 AS warnings_count + tags FROM agents.dbt_model""" def _dbt_columns_sql(existing: set[str]) -> str: if not {"dbt_model", "dbt_column"}.issubset(existing): - return _union_or_empty([], _COLUMN_COLUMNS) + return _empty_view(_COLUMN_COLUMNS) return """SELECT CAST(NULL AS VARCHAR) AS table_catalog, m.schema_name AS table_schema, m.name AS table_name, c.column_name, - CAST(NULL AS NUMBER) AS ordinal_position, - c.data_type, - CAST(NULL AS BOOLEAN) AS is_nullable, c.column_name AS display_name, c.description, CAST(NULL AS TEXT) AS ai_context, @@ -327,41 +243,15 @@ def _dbt_columns_sql(existing: set[str]) -> str: CAST(NULL AS BOOLEAN) AS is_time_dimension, CAST(NULL AS TEXT) AS expression, 'dbt' AS source_provider, - c.model_id || '.' || c.column_name AS source_object_id, - 0 AS memories_count, - 0 AS warnings_count + c.model_id || '.' || c.column_name AS source_object_id FROM agents.dbt_column c JOIN agents.dbt_model m ON m.unique_id = c.model_id""" -def _dbt_relationships_sql(existing: set[str]) -> str: - if not {"dbt_dependency", "dbt_model"}.issubset(existing): - return _union_or_empty([], _RELATIONSHIP_COLUMNS) - return """SELECT - d.upstream_id || ' -> ' || d.downstream_id AS relationship_name, - CAST(NULL AS VARCHAR) AS from_catalog, - upstream.schema_name AS from_schema, - upstream.name AS from_table, - CAST(NULL AS VARCHAR) AS from_column, - CAST(NULL AS VARCHAR) AS to_catalog, - downstream.schema_name AS to_schema, - downstream.name AS to_table, - CAST(NULL AS VARCHAR) AS to_column, - 'lineage' AS relationship_type, - CAST(NULL AS VARCHAR) AS multiplicity, - 'dbt' AS source_provider, - d.upstream_id || ' -> ' || d.downstream_id AS source_object_id, - 0 AS memories_count, - 0 AS warnings_count -FROM agents.dbt_dependency d -JOIN agents.dbt_model upstream ON upstream.unique_id = d.upstream_id -JOIN agents.dbt_model downstream ON downstream.unique_id = d.downstream_id""" 
- - def _lookml_tables_sql(existing: set[str]) -> str: if "lookml_view" not in existing: - return _union_or_empty([], _TABLE_COLUMNS) - catalog_sql, schema_sql, table_sql = _lookml_relation_identity_sql("sql_table_name", "name") + return _empty_view(_TABLE_COLUMNS) + catalog_sql, schema_sql, table_sql = _relation_identity_sql("sql_table_name", "name") return f"""SELECT {catalog_sql}, {schema_sql}, @@ -374,24 +264,19 @@ def _lookml_tables_sql(existing: set[str]) -> str: name AS source_object_id, file_path AS source_path, CAST(NULL AS VARCHAR) AS materialization, - PARSE_JSON('[]') AS tags, - 0 AS memories_count, - 0 AS warnings_count + PARSE_JSON('[]') AS tags FROM agents.lookml_view""" def _lookml_columns_sql(existing: set[str]) -> str: if not {"lookml_dimension", "lookml_view"}.issubset(existing): - return _union_or_empty([], _COLUMN_COLUMNS) - catalog_sql, schema_sql, table_sql = _lookml_relation_identity_sql("v.sql_table_name", "v.name") + return _empty_view(_COLUMN_COLUMNS) + catalog_sql, schema_sql, table_sql = _relation_identity_sql("v.sql_table_name", "v.name") return f"""SELECT {catalog_sql}, {schema_sql}, {table_sql}, d.field_name AS column_name, - CAST(NULL AS NUMBER) AS ordinal_position, - d.type AS data_type, - CAST(NULL AS BOOLEAN) AS is_nullable, d.field_name AS display_name, d.description, d.ai_context, @@ -399,35 +284,15 @@ def _lookml_columns_sql(existing: set[str]) -> str: d.field_kind = 'dimension_group' AS is_time_dimension, d.sql AS expression, 'lookml' AS source_provider, - d.view_name || '.' || d.field_name AS source_object_id, - 0 AS memories_count, - 0 AS warnings_count + d.view_name || '.' 
|| d.field_name AS source_object_id FROM agents.lookml_dimension d JOIN agents.lookml_view v ON v.name = d.view_name""" -def _lookml_metrics_sql(existing: set[str]) -> str: - if "lookml_measure" not in existing: - return _union_or_empty([], _METRIC_COLUMNS) - return """SELECT - measure_name AS metric_name, - measure_name AS display_name, - description, - ai_context, - COALESCE(sql, filters) AS expression, - 'lookml' AS source_provider, - view_name || '.' || measure_name AS source_object_id, - CAST(NULL AS VARCHAR) AS dataset_name, - view_name, - 0 AS memories_count, - 0 AS warnings_count -FROM agents.lookml_measure""" - - def _osi_tables_sql(existing: set[str]) -> str: if "osi_dataset" not in existing: - return _union_or_empty([], _TABLE_COLUMNS) - catalog_sql, schema_sql, table_sql = _lookml_relation_identity_sql("source_table", "name") + return _empty_view(_TABLE_COLUMNS) + catalog_sql, schema_sql, table_sql = _relation_identity_sql("source_table", "name") return f"""SELECT {catalog_sql}, {schema_sql}, @@ -440,23 +305,19 @@ def _osi_tables_sql(existing: set[str]) -> str: name AS source_object_id, CAST(NULL AS VARCHAR) AS source_path, CAST(NULL AS VARCHAR) AS materialization, - PARSE_JSON('[]') AS tags, - 0 AS memories_count, - 0 AS warnings_count + PARSE_JSON('[]') AS tags FROM agents.osi_dataset""" def _osi_columns_sql(existing: set[str]) -> str: if not {"osi_dataset", "osi_field"}.issubset(existing): - return _union_or_empty([], _COLUMN_COLUMNS) - return """SELECT - CAST(NULL AS VARCHAR) AS table_catalog, - CAST(NULL AS VARCHAR) AS table_schema, - d.source_table AS table_name, + return _empty_view(_COLUMN_COLUMNS) + catalog_sql, schema_sql, table_sql = _relation_identity_sql("d.source_table", "d.name") + return f"""SELECT + {catalog_sql}, + {schema_sql}, + {table_sql}, f.field_name AS column_name, - CAST(NULL AS NUMBER) AS ordinal_position, - CAST(NULL AS VARCHAR) AS data_type, - CAST(NULL AS BOOLEAN) AS is_nullable, COALESCE(f.label, f.field_name) AS 
display_name, f.description, f.ai_context, @@ -464,50 +325,6 @@ def _osi_columns_sql(existing: set[str]) -> str: f.is_time_dimension, f.expression, 'osi' AS source_provider, - f.dataset_name || '.' || f.field_name AS source_object_id, - 0 AS memories_count, - 0 AS warnings_count + f.dataset_name || '.' || f.field_name AS source_object_id FROM agents.osi_field f JOIN agents.osi_dataset d ON d.name = f.dataset_name""" - - -def _osi_relationships_sql(existing: set[str]) -> str: - if not {"osi_dataset", "osi_relationship"}.issubset(existing): - return _union_or_empty([], _RELATIONSHIP_COLUMNS) - return """SELECT - r.name AS relationship_name, - CAST(NULL AS VARCHAR) AS from_catalog, - CAST(NULL AS VARCHAR) AS from_schema, - from_dataset.source_table AS from_table, - r.from_columns::TEXT AS from_column, - CAST(NULL AS VARCHAR) AS to_catalog, - CAST(NULL AS VARCHAR) AS to_schema, - to_dataset.source_table AS to_table, - r.to_columns::TEXT AS to_column, - 'semantic_relationship' AS relationship_type, - CAST(NULL AS VARCHAR) AS multiplicity, - 'osi' AS source_provider, - r.name AS source_object_id, - 0 AS memories_count, - 0 AS warnings_count -FROM agents.osi_relationship r -JOIN agents.osi_dataset from_dataset ON from_dataset.name = r.from_dataset -JOIN agents.osi_dataset to_dataset ON to_dataset.name = r.to_dataset""" - - -def _osi_metrics_sql(existing: set[str]) -> str: - if "osi_metric" not in existing: - return _union_or_empty([], _METRIC_COLUMNS) - return """SELECT - name AS metric_name, - name AS display_name, - description, - ai_context, - expression, - 'osi' AS source_provider, - name AS source_object_id, - CAST(NULL AS VARCHAR) AS dataset_name, - CAST(NULL AS VARCHAR) AS view_name, - 0 AS memories_count, - 0 AS warnings_count -FROM agents.osi_metric""" diff --git a/tests/test_root.py b/tests/test_root.py index 8c4ed4b..7b1cb6a 100644 --- a/tests/test_root.py +++ b/tests/test_root.py @@ -24,7 +24,7 @@ def 
test_upsert_provider_root_writes_only_requested_provider(self): self.assertEqual({row[0] for row in rows}, {"dbt"}) self.assertEqual( {row[1] for row in rows}, - {"overview", "model", "column", "dependency", "tables", "columns", "relationships"}, + {"overview", "model", "column", "dependency", "tables", "columns"}, ) def test_upsert_provider_root_has_osi_entries(self): @@ -35,7 +35,7 @@ def test_upsert_provider_root_has_osi_entries(self): _, rows = dest.upserts[0] self.assertEqual( {row[1] for row in rows}, - {"overview", "dataset", "field", "metric", "relationship", "tables", "columns", "relationships", "metrics"}, + {"overview", "dataset", "field", "metric", "relationship", "tables", "columns"}, ) def test_upsert_provider_root_has_core_view_entries(self): @@ -46,7 +46,7 @@ def test_upsert_provider_root_has_core_view_entries(self): _, rows = dest.upserts[0] self.assertEqual( {row[1] for row in rows}, - {"overview", "root", "tables", "columns", "relationships", "metrics", "entities"}, + {"overview", "root", "tables", "columns"}, ) diff --git a/tests/test_views.py b/tests/test_views.py index 47b9e54..3f34757 100644 --- a/tests/test_views.py +++ b/tests/test_views.py @@ -4,71 +4,70 @@ class ContextViewSqlTests(unittest.TestCase): - def test_builds_provider_views_from_raw_provider_tables(self): + def test_builds_only_tables_and_columns_surfaces(self): views = build_context_view_sql({"dbt_model", "dbt_column"}) self.assertEqual(PROVIDER_VIEW_NAMES | CORE_VIEW_NAMES, set(views)) + self.assertEqual(CORE_VIEW_NAMES, {"tables", "columns"}) + # relationships / metrics / entities are out of scope for v1 + self.assertNotIn("relationships", views) + self.assertNotIn("metrics", views) + self.assertNotIn("entities", views) + + def test_builds_provider_views_from_raw_provider_tables(self): + views = build_context_view_sql({"dbt_model", "dbt_column"}) + self.assertIn("FROM agents.dbt_model", views["dbt_tables"]) self.assertIn("FROM agents.dbt_column c", views["dbt_columns"]) - def 
test_core_tables_enriches_information_schema_with_provider_tables(self): - views = build_context_view_sql({"dbt_model", "dbt_column", "lookml_view"}) - - self.assertIn("FROM information_schema.tables t", views["tables"]) - self.assertIn("FROM agents.dbt_tables", views["tables"]) - self.assertIn(") dbt", views["tables"]) - self.assertIn("FROM agents.lookml_tables", views["tables"]) - self.assertIn(") lookml", views["tables"]) - self.assertIn("dbt.description AS dbt_description", views["tables"]) - self.assertIn("lookml.ai_context AS lookml_ai_context", views["tables"]) - self.assertIn("LOWER(t.table_name) = LOWER(dbt.table_name)", views["tables"]) - self.assertIn("GROUP BY table_catalog, table_schema, table_name", views["tables"]) - self.assertIn("SUM(memories_count) AS memories_count", views["tables"]) - self.assertIn("t.last_ddl", views["tables"]) - self.assertIn("t.last_ddl_by", views["tables"]) - self.assertIn("t.auto_clustering_on", views["tables"]) - self.assertIn("t.is_hybrid", views["tables"]) - self.assertNotIn("FROM agents.dbt_model", views["tables"]) - - def test_core_columns_still_union_provider_normalized_columns(self): - views = build_context_view_sql({"dbt_model", "dbt_column", "lookml_view"}) - - self.assertIn("FROM agents.dbt_columns", views["columns"]) - self.assertNotIn("FROM agents.dbt_column c", views["columns"]) - - def test_builds_metric_view_from_provider_metric_views(self): - views = build_context_view_sql({"lookml_measure", "osi_metric"}) - - self.assertIn("FROM agents.lookml_metrics", views["metrics"]) - self.assertIn("FROM agents.osi_metrics", views["metrics"]) - self.assertIn("FROM agents.lookml_measure", views["lookml_metrics"]) - self.assertIn("FROM agents.osi_metric", views["osi_metrics"]) - - def test_dbt_relationships_use_model_names_for_table_endpoints(self): - views = build_context_view_sql({"dbt_model", "dbt_dependency"}) + def test_core_tables_uses_information_schema_star_spine(self): + views = 
build_context_view_sql({"dbt_model", "lookml_view", "osi_dataset"}) + tables = views["tables"] + + # native spine is SELECT t.* — no hardcoded information_schema column list + self.assertIn("SELECT\n t.*", tables) + self.assertIn("FROM information_schema.tables t", tables) + self.assertNotIn("t.is_hybrid", tables) + self.assertNotIn("t.last_ddl", tables) + + def test_core_tables_merges_all_provider_tables_by_identity(self): + views = build_context_view_sql({"dbt_model", "lookml_view", "osi_dataset"}) + tables = views["tables"] + + # one prefixed enrichment column per provider, joined by identity + self.assertIn("dbt.description AS dbt_description", tables) + self.assertIn("lookml.ai_context AS lookml_ai_context", tables) + self.assertIn("osi.description AS osi_description", tables) + self.assertIn("FROM agents.dbt_tables", tables) + self.assertIn("FROM agents.osi_tables", tables) + self.assertIn("LOWER(t.table_name) = LOWER(dbt.table_name)", tables) + self.assertIn("GROUP BY table_catalog, table_schema, table_name", tables) + # generic merge: no hardcoded memory counts anywhere + self.assertNotIn("memories_count", tables) + self.assertNotIn("warnings_count", tables) + + def test_core_columns_merges_by_column_identity(self): + views = build_context_view_sql({"dbt_model", "dbt_column"}) + columns = views["columns"] - self.assertIn("JOIN agents.dbt_model upstream ON upstream.unique_id = d.upstream_id", views["dbt_relationships"]) - self.assertIn("JOIN agents.dbt_model downstream ON downstream.unique_id = d.downstream_id", views["dbt_relationships"]) - self.assertIn("upstream.name AS from_table", views["dbt_relationships"]) - self.assertIn("downstream.name AS to_table", views["dbt_relationships"]) + self.assertIn("SELECT\n t.*", columns) + self.assertIn("FROM information_schema.columns t", columns) + self.assertIn("FROM agents.dbt_columns", columns) + self.assertIn("LOWER(t.column_name) = LOWER(dbt.column_name)", columns) + self.assertIn("dbt.description AS 
dbt_description", columns) - def test_osi_relationships_use_dataset_source_tables_for_table_endpoints(self): - views = build_context_view_sql({"osi_dataset", "osi_relationship"}) + def test_dbt_relationships_view_is_gone(self): + views = build_context_view_sql({"dbt_model", "dbt_dependency"}) - self.assertIn("JOIN agents.osi_dataset from_dataset ON from_dataset.name = r.from_dataset", views["osi_relationships"]) - self.assertIn("JOIN agents.osi_dataset to_dataset ON to_dataset.name = r.to_dataset", views["osi_relationships"]) - self.assertIn("from_dataset.source_table AS from_table", views["osi_relationships"]) - self.assertIn("to_dataset.source_table AS to_table", views["osi_relationships"]) + self.assertNotIn("dbt_relationships", views) - def test_osi_tables_parse_source_table_for_table_merge_identity(self): - views = build_context_view_sql({"osi_dataset"}) + def test_osi_columns_parse_source_table_like_osi_tables(self): + views = build_context_view_sql({"osi_dataset", "osi_field"}) - self.assertIn("REGEXP_COUNT(source_table, '[.]') = 2", views["osi_tables"]) - self.assertIn("THEN SPLIT_PART(source_table, '.', 1)", views["osi_tables"]) - self.assertIn("END AS table_catalog", views["osi_tables"]) - self.assertIn("END AS table_schema", views["osi_tables"]) - self.assertIn("THEN SPLIT_PART(source_table, '.', 3)", views["osi_tables"]) - self.assertIn("END AS table_name", views["osi_tables"]) + # columns identity must align with tables identity (parse source_table) + self.assertIn("THEN SPLIT_PART(d.source_table, '.', 2)", views["osi_columns"]) + self.assertIn("END AS table_schema", views["osi_columns"]) + self.assertIn("END AS table_name", views["osi_columns"]) def test_lookml_tables_parse_simple_sql_table_name(self): views = build_context_view_sql({"lookml_view"}) @@ -76,10 +75,6 @@ def test_lookml_tables_parse_simple_sql_table_name(self): self.assertIn("REGEXP_COUNT(sql_table_name, '[.]') = 2", views["lookml_tables"]) self.assertIn("THEN SPLIT_PART(sql_table_name, 
'.', 1)", views["lookml_tables"]) self.assertIn("END AS table_catalog", views["lookml_tables"]) - self.assertIn("END AS table_schema", views["lookml_tables"]) - self.assertIn("THEN SPLIT_PART(sql_table_name, '.', 3)", views["lookml_tables"]) - self.assertIn("END AS table_name", views["lookml_tables"]) - self.assertIn("ELSE name", views["lookml_tables"]) def test_lookml_columns_use_same_sql_table_name_identity_as_tables(self): views = build_context_view_sql({"lookml_view", "lookml_dimension"}) @@ -87,16 +82,16 @@ def test_lookml_columns_use_same_sql_table_name_identity_as_tables(self): self.assertIn("FROM agents.lookml_dimension d", views["lookml_columns"]) self.assertIn("JOIN agents.lookml_view v ON v.name = d.view_name", views["lookml_columns"]) self.assertIn("REGEXP_COUNT(v.sql_table_name, '[.]') = 2", views["lookml_columns"]) - self.assertIn("THEN SPLIT_PART(v.sql_table_name, '.', 3)", views["lookml_columns"]) self.assertIn("ELSE v.name", views["lookml_columns"]) - def test_empty_view_has_typed_projection(self): + def test_missing_provider_tables_become_typed_empty_views(self): views = build_context_view_sql(set()) - self.assertIn("CAST(NULL AS VARCHAR) AS table_name", views["dbt_tables"]) + # no provider tables exist: provider views are empty typed projections, + # but AGENTS.TABLES still works as an information_schema passthrough self.assertIn("WHERE 1 = 0", views["dbt_tables"]) + self.assertIn("CAST(NULL AS VARCHAR) AS table_catalog", views["dbt_tables"]) self.assertIn("FROM information_schema.tables t", views["tables"]) - self.assertIn("CAST(NULL AS VARCHAR) AS entity_id", views["entities"]) if __name__ == "__main__": From 4e5b26ad741fa4315dbe29b84f5a05e2ee93a15e Mon Sep 17 00:00:00 2001 From: Dave Fowler Date: Sat, 30 May 2026 23:41:10 -0700 Subject: [PATCH 3/3] Address review nits: t.* collision guard, time-dimension precision, docs - comment that _ prefixing keeps t.* from colliding with enrichment - is_time_dimension only true for dimension_group with 
type time (not duration) - document name-based, case-folded identity matching in _merge_on - README: note AGENTS.TABLES/COLUMNS are enriched information_schema and are per-database (point workflows at the data's database) - proposal: reconcile Duplicate And Merge Policy with the v1 prefixed-merge model Co-Authored-By: Claude Opus 4.8 --- README.md | 9 +++++++-- proposals/agent-schema-views.md | 24 ++++++++++++++++-------- src/agents_schema/views.py | 9 ++++++++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index ac8244e..107a7e9 100644 --- a/README.md +++ b/README.md @@ -128,8 +128,13 @@ Agents Schema is the shared, queryable metadata surface for consumers that start from the warehouse and need context about data that already exists there. It is closest in spirit to `information_schema`, but extensible across many -providers. Compared with MCP servers, Agents Schema is narrower: it publishes -context inside the warehouse, while MCP servers can expose tools, actions, and +providers. In fact `AGENTS.TABLES` and `AGENTS.COLUMNS` are drop-in enriched +versions of `INFORMATION_SCHEMA.TABLES`/`COLUMNS`: the native columns plus +provider-prefixed context (`dbt_description`, `lookml_ai_context`, …). Because +`INFORMATION_SCHEMA` is per-database, these views cover the database that holds +the `AGENTS` schema — point the workflows at the database your data lives in. +Compared with MCP servers, Agents Schema is narrower: it publishes context +inside the warehouse, while MCP servers can expose tools, actions, and source-specific workflows. ### How it works diff --git a/proposals/agent-schema-views.md b/proposals/agent-schema-views.md index 39bf05a..b22e05d 100644 --- a/proposals/agent-schema-views.md +++ b/proposals/agent-schema-views.md @@ -351,14 +351,22 @@ That layer should have tests that pin: The hard part is not defining view columns; it is merging provider records. 
-Recommended first version: - -- do not aggressively merge objects from different providers -- emit one row per provider object -- preserve `source_provider` and `source_object_id` -- let agents decide which source to trust when duplicates exist - -Later versions can add canonicalization if Agents Schema gains stable warehouse object identifiers. +**v1 approach (implemented):** merge by object identity onto the native +`INFORMATION_SCHEMA` spine, with each provider's columns appended under a +`<provider>_` prefix. Providers therefore never collide — there is no +cross-provider "which source wins" decision, because each keeps its own +namespaced columns. Within a single provider, rows are aggregated to one row +per identity before the join so duplicate provider rows cannot multiply native +rows. + +The earlier sketch below considered the alternative of emitting one row per +provider object (a union) and letting agents pick a source. v1 chose prefixed +merge instead, since it preserves the one-row-per-object grain that makes the +views information-schema-swappable. A coalesced single `description`/`ai_context` +with a trust order remains a possible future option. + +- preserve `source_provider` and `<provider>_source_object_id` for drill-down +- later versions can add canonicalization if Agents Schema gains stable warehouse object identifiers ## Resolved Decisions (v1) diff --git a/src/agents_schema/views.py b/src/agents_schema/views.py index 7d2e9f9..23e376f 100644 --- a/src/agents_schema/views.py +++ b/src/agents_schema/views.py @@ -95,6 +95,9 @@ def _merge_view( for alias in (_provider_alias(name, suffix) for name in views) ] joins = "\n".join(_merge_join(name, _provider_alias(name, suffix), identity, merge_columns) for name in views) + # Every enrichment column is `<provider>_` prefixed, so `t.*` (the native + # spine columns) can never collide with appended columns. Keep that prefix + # if more enrichment is added later.
enrichment = (",\n " + ",\n ".join(selects)) if selects else "" return f"SELECT\n t.*{enrichment}\nFROM {spine} t\n{joins}" @@ -121,6 +124,10 @@ def _merge_join(view_name: str, alias: str, identity: tuple[str, ...], merge_col def _merge_on(alias: str, column: str) -> str: + # Enrichment attaches by case-folded object name, not a guaranteed-unique + # key. A provider row with NULL table_catalog matches the spine in any + # catalog; since the spine is single-database, that is effectively + # schema+name (plus column) identity. if column == "table_catalog": return f"({alias}.{column} IS NULL OR LOWER(t.{column}) = LOWER({alias}.{column}))" return f"LOWER(t.{column}) = LOWER({alias}.{column})" @@ -281,7 +288,7 @@ def _lookml_columns_sql(existing: set[str]) -> str: d.description, d.ai_context, d.field_kind AS semantic_type, - d.field_kind = 'dimension_group' AS is_time_dimension, + d.field_kind = 'dimension_group' AND COALESCE(d.type, 'time') = 'time' AS is_time_dimension, d.sql AS expression, 'lookml' AS source_provider, d.view_name || '.' || d.field_name AS source_object_id