From 71bffe6295878211e37d61e0f3dc9b28ae1f96e0 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Tue, 16 Jun 2026 20:44:33 +0200 Subject: [PATCH 01/13] =?UTF-8?q?DEV-1566:=20pg-facade=20=E2=80=94=20CAST(?= =?UTF-8?q?=20AS=20)=20in=20projection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pattern-match CAST around a bare or qualified Column in the shared facade translator. The engine still executes the bare column; the wire OID is overridden via projection_types, made safe by a strict per-pair coercion allowlist. Flips the DEV-1562 strict-xfail test_timestamp_round_trip_both_formats. Codex round-1 follow-ups folded in: OID_TIMESTAMP + dt.date widening in value_to_text (text-format DATE→TIMESTAMP was emitting bare YYYY-MM-DD); OID_TEXT + bool now emits true/false (Postgres text shape) rather than t/f; _order_by_name and _validate_group_by gain CAST detection branches so the canonical lowercase form resolves; DATE / BOOLEAN columns added to the translator test fixture; DOUBLE→INT dropped from the allowlist (Python int(x) truncates while Postgres rounds half-to-even); unknown source data type admits TEXT only. Codex round-2 follow-ups: column_name_mapping for aliased CAST now points at the bare engine column (engine still executes it); type-alias coverage for VARCHAR/INTEGER/BIGINT/SMALLINT/FLOAT/REAL/DECIMAL/DATETIME/TIMESTAMPTZ; aliased + qualified-ref ORDER BY / GROUP BY canonical resolution; full error-contract pinning; non-aggregate non-column inner rejection. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/interfaces/pg-facade.md | 39 ++ slayer/facade/translator.py | 202 +++++++- slayer/pg_facade/types.py | 15 + tests/facade/test_translator.py | 434 ++++++++++++++++++ .../integration/test_integration_pg_facade.py | 15 + tests/integration/test_metabase_e2e.py | 4 - tests/pg_facade/test_types.py | 64 ++- 7 files changed, 763 insertions(+), 10 deletions(-) diff --git a/docs/interfaces/pg-facade.md b/docs/interfaces/pg-facade.md index 3aff8f73..3e7c6d07 100644 --- a/docs/interfaces/pg-facade.md +++ b/docs/interfaces/pg-facade.md @@ -142,6 +142,45 @@ Postgres-specific predicates that aren't valid SLayer DSL (`ILIKE`, `::cast`, re `ANY`/`ALL`) parse but are rejected at execution — use the standard comparison / `IN` / `BETWEEN` forms. +### `CAST( AS )` in projection + +A projection of the shape `CAST( AS )` (and the equivalent `col::type` +sugar) is accepted when the inner expression is a bare or qualified column reference +**and** the (source, target) pair is in the allowlist below. The engine still executes +the bare column — the cast is a pure wire-layer type rewrite. The projected column's +Postgres OID is overridden to match the casted type. + +Common BI shapes covered: `SELECT CAST(ordered_at AS TIMESTAMP) FROM orders` (DATE +column promoted for a TIMESTAMP-aware client), `SELECT CAST(amount AS TEXT) AS s +FROM orders` (stringification), `SELECT CAST(customers.region AS TEXT) FROM orders` +(joined column). + +Out of scope: `CAST` around aggregates (`CAST(SUM(amount) AS DOUBLE)`), `TRY_CAST`, +and `CAST` around expressions that aren't a bare column (`CAST(SUBSTRING(...) AS T)`). +`CAST` wrapping a `DATE_TRUNC(...)` continues to route through the time-grain unwrap. + +Admitted (source, target) coercions: + +| Source type | Admitted target types | +|---------------|------------------------------| +| `DATE` | `DATE`, `TIMESTAMP`, `TEXT` | +| `TIMESTAMP` | `TIMESTAMP`, `DATE`, `TEXT` | +| `INT` | `INT`, `DOUBLE`, `TEXT` | +| `DOUBLE` | `DOUBLE`, `TEXT` | +| `BOOLEAN` | `BOOLEAN`, `TEXT` | +| `TEXT` | `TEXT` | +| *(unknown)* | `TEXT` | + +Pairs outside the allowlist (e.g. `CAST(name AS INT)`, `CAST(amount AS BOOLEAN)`) +raise `Unsupported CAST: cannot project column as (...). Admitted +coercions: see docs/interfaces/pg-facade.md.` Unsupported target types (`UUID`, `JSON`, +`ARRAY`, `STRUCT`, …) raise the standard `Unsupported projection expression` error. + +`DOUBLE → INT` is intentionally excluded: Python's `int()` truncates toward zero +while Postgres rounds half-to-even, so silently admitting the pair would diverge from +`psql` semantics. Pre-aggregate or pre-round on your side when an integer-typed result +is required. + ## Introspection * `INFORMATION_SCHEMA.METRICS` / `DIMENSIONS` / `SCHEMATA` / `TABLES` / `COLUMNS`. diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index c276f838..66662503 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -229,6 +229,79 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: exp.NEQ: "<>", } +# DEV-1566: sqlglot DataType.Type → SLayer DataType for CAST( AS ) +# projection support. Aliases sqlglot canonicalises at parse time (STRING→TEXT, +# INTEGER→INT, NUMERIC→DECIMAL, BOOL→BOOLEAN, FLOAT→DOUBLE) need no entry; +# REAL parses to exp.DataType.Type.FLOAT (not normalised), so DOUBLE coverage +# routes through here. Parameterised forms (VARCHAR(255), DECIMAL(10,2)) +# collapse onto the same base member at parse time — precision is dropped at +# the SLayer boundary because SLayer wire types don't carry it. +_SQLGLOT_TYPE_TO_DATATYPE: Dict[exp.DataType.Type, DataType] = { + exp.DataType.Type.TEXT: DataType.TEXT, + exp.DataType.Type.VARCHAR: DataType.TEXT, + exp.DataType.Type.CHAR: DataType.TEXT, + exp.DataType.Type.NCHAR: DataType.TEXT, + exp.DataType.Type.NVARCHAR: DataType.TEXT, + exp.DataType.Type.INT: DataType.INT, + exp.DataType.Type.BIGINT: DataType.INT, + exp.DataType.Type.SMALLINT: DataType.INT, + exp.DataType.Type.TINYINT: DataType.INT, + exp.DataType.Type.MEDIUMINT: DataType.INT, + exp.DataType.Type.DOUBLE: DataType.DOUBLE, + exp.DataType.Type.FLOAT: DataType.DOUBLE, + exp.DataType.Type.DECIMAL: DataType.DOUBLE, + exp.DataType.Type.BOOLEAN: DataType.BOOLEAN, + exp.DataType.Type.DATE: DataType.DATE, + exp.DataType.Type.TIMESTAMP: DataType.TIMESTAMP, + exp.DataType.Type.DATETIME: DataType.TIMESTAMP, + exp.DataType.Type.TIMESTAMPTZ: DataType.TIMESTAMP, + exp.DataType.Type.TIMESTAMPLTZ: DataType.TIMESTAMP, +} + +# DEV-1566: per-pair allowlist of CAST coercions admitted in projection. +# Each pair is one the wire encoders in slayer/pg_facade/types.py handle +# losslessly. The unknown-source (None) case admits ONLY TEXT — for every +# other target we'd push the failure into value_to_text / value_to_binary at +# response time, surfacing as an opaque connection error instead of a clean +# translation error. Identity (X→X) is always admitted; computed implicitly. +_CAST_ADMITTED_TARGETS: Dict[DataType, frozenset] = { + DataType.DATE: frozenset({DataType.DATE, DataType.TIMESTAMP, DataType.TEXT}), + DataType.TIMESTAMP: frozenset({DataType.TIMESTAMP, DataType.DATE, DataType.TEXT}), + DataType.INT: frozenset({DataType.INT, DataType.DOUBLE, DataType.TEXT}), + # DOUBLE → INT intentionally dropped (Python int(x) truncates toward zero; + # Postgres rounds half-to-even). Round-2 Codex review. + DataType.DOUBLE: frozenset({DataType.DOUBLE, DataType.TEXT}), + DataType.BOOLEAN: frozenset({DataType.BOOLEAN, DataType.TEXT}), + DataType.TEXT: frozenset({DataType.TEXT}), +} + + +def _sqlglot_type_to_datatype(node: exp.DataType) -> Optional[DataType]: + """Map a sqlglot ``DataType`` node to a SLayer ``DataType``. + + Returns ``None`` when the type member isn't in the mapping (UUID, JSON, + ARRAY, STRUCT, …) — caller falls through to the existing + 'Unsupported projection expression' error. + """ + return _SQLGLOT_TYPE_TO_DATATYPE.get(node.this) + + +def _is_admitted_cast(source: Optional[DataType], target: DataType) -> bool: + """Strict per-pair admission. ``None`` source admits only ``TEXT``.""" + if source is None: + return target is DataType.TEXT + if source == target: + return True + return target in _CAST_ADMITTED_TARGETS.get(source, frozenset()) + + +def _alias_for_column_cast(dotted: str, target: DataType) -> str: + """Canonical ORDER BY / GROUP BY key for an unaliased ``CAST(col AS T)`` + projection. Mirrors ``_alias_for_time_grain`` — lowercase, dotted, + unquoted, no precision — so the equality lookup in ``_translate_slayer_select`` + matches what ``_order_by_name`` / ``_validate_group_by`` produce.""" + return f"cast({dotted} as {target.value.lower()})" + def _column_to_dotted( col: exp.Column, *, strip_prefix: Optional[Tuple[str, str]] = None, @@ -735,6 +808,10 @@ class _ProjectionItem(BaseModel): dimension: Optional[FacadeDimension] = None time_grain: Optional[TimeGranularity] = None time_grain_underlying: Optional[FacadeDimension] = None + # DEV-1566: target type when the projection was CAST( AS ). + # Overrides projection_types[i] at _record_metric / _record_dimension time, + # leaving engine_alias and the SlayerQuery measure/dimension untouched. + cast_target: Optional[DataType] = None def _resolve_time_grain_projection( @@ -811,6 +888,61 @@ def _resolve_column_projection( ) +def _detect_column_cast( + body: exp.Expression, +) -> Optional[Tuple[exp.Column, DataType]]: + """If ``body`` is ``CAST( AS )``, + return ``(inner_col, target_data_type)``. Otherwise ``None``. + + The detector REQUIRES ``body.this`` to be an ``exp.Column`` — aggregates, + hygiene wrappers, function calls, and arithmetic stay outside scope. + ``exp.TryCast`` is rejected (Postgres has no native TRY_CAST equivalent). + """ + if not isinstance(body, exp.Cast) or isinstance(body, exp.TryCast): + return None + inner = body.this + if not isinstance(inner, exp.Column): + return None + target = _sqlglot_type_to_datatype(body.to) + if target is None: + return None + return inner, target + + +def _resolve_column_cast_projection( + *, + body: exp.Column, + cast_target: DataType, + alias_name: Optional[str], + table: FacadeTable, + metrics_by_name: Dict[str, FacadeMetric], + dims_by_name: Dict[str, FacadeDimension], + strip_prefix: Optional[Tuple[str, str]] = None, +) -> _ProjectionItem: + """Resolve ``CAST( AS )`` to a projection item that runs + the bare column through the engine and overrides the wire OID via + ``cast_target``. Enforces the strict per-pair coercion allowlist. + """ + inner = _resolve_column_projection( + body=body, alias_name=alias_name, table=table, + metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, + strip_prefix=strip_prefix, + ) + source: Optional[DataType] = ( + inner.metric.data_type if inner.metric is not None + else inner.dimension.data_type if inner.dimension is not None + else None + ) + if not _is_admitted_cast(source=source, target=cast_target): + offending = exp.Cast(this=body, to=exp.DataType.build(cast_target.value)).sql() + raise TranslationError( + f"Unsupported CAST: cannot project {source!s} column as " + f"{cast_target!s} ({offending!r}). Admitted coercions: see " + f"docs/interfaces/pg-facade.md." + ) + return inner.model_copy(update={"cast_target": cast_target}) + + # DEV-1558 B5: hygiene-scalar wrappers Metabase uses for fingerprint queries. # Each maps a sqlglot AST class to a human-readable name for the WARNING log. _HYGIENE_FUNC_CLASSES: Dict[type, str] = { @@ -936,6 +1068,21 @@ def _resolve_projection( )) continue + # DEV-1566: CAST( AS ) projection. Runs AFTER the + # time-grain unwrap (preserves Metabase's CAST(DATE_TRUNC(...) AS DATE)) + # and AFTER the aggregate detector (CAST(SUM(...) AS T) is out of + # scope — the detector requires the inner to be a bare Column). + cast_match = _detect_column_cast(body) + if cast_match is not None: + inner_col, cast_target = cast_match + out.append(_resolve_column_cast_projection( + body=inner_col, cast_target=cast_target, + alias_name=alias_name, table=table, + metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, + strip_prefix=strip_prefix, + )) + continue + if isinstance(body, exp.Column): out.append(_resolve_column_projection( body=body, alias_name=alias_name, table=table, @@ -1266,6 +1413,14 @@ def _order_by_name( if grain_match is not None: grain, col = grain_match return _alias_for_time_grain(grain, col, strip_prefix=strip_prefix) + # DEV-1566: CAST( AS ) ORDER BY resolves via the same + # lowercase canonical key the projection-time registration emits. + cast_match = _detect_column_cast(body) + if cast_match is not None: + inner_col, target = cast_match + return _alias_for_column_cast( + _column_to_dotted(inner_col, strip_prefix=strip_prefix), target, + ) return body.sql() @@ -1289,8 +1444,19 @@ def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; e user_items.append(_alias_for_time_grain( grain, col, strip_prefix=strip_prefix, )) - else: - user_items.append(g.sql()) + continue + # DEV-1566: CAST( AS ) GROUP BY canonicalises to + # the same lowercase form _record_dimension registers in + # derived_dims, so dim-only `GROUP BY CAST(col AS T)` validates. + cast_match = _detect_column_cast(g) + if cast_match is not None: + inner_col, target = cast_match + user_items.append(_alias_for_column_cast( + _column_to_dotted(inner_col, strip_prefix=strip_prefix), + target, + )) + continue + user_items.append(g.sql()) for u in user_items: if u not in derived_set: # GROUP BY positional refs (GROUP BY 1) and aggregate terms are @@ -1469,7 +1635,8 @@ def _record_metric( }) engine_alias = f"{table.name}.{item.projected_name}" plan.column_name_mapping.append((engine_alias, item.projected_name)) - plan.projection_types.append(item.metric.data_type) + # DEV-1566: CAST( AS T) overrides the declared metric type. + plan.projection_types.append(item.cast_target or item.metric.data_type) def _record_time_grain( @@ -1504,9 +1671,18 @@ def _record_dimension( assert item.dimension is not None plan.dimension_refs.append(ColumnRef.from_string(item.dimension.dimension_ref)) plan.derived_dims.append(item.projected_name) + # DEV-1566: register the canonical CAST form as a derived dim so a + # `GROUP BY CAST(col AS T)` validates (mirrors the time-grain pattern). + if item.cast_target is not None: + canonical = _alias_for_column_cast( + item.dimension.dimension_ref, item.cast_target, + ) + if canonical != item.projected_name: + plan.derived_dims.append(canonical) engine_alias = f"{table.name}.{item.dimension.dimension_ref}" plan.column_name_mapping.append((engine_alias, item.projected_name)) - plan.projection_types.append(item.dimension.data_type) + # DEV-1566: CAST( AS T) overrides the declared dim type. + plan.projection_types.append(item.cast_target or item.dimension.data_type) def _build_projection_plan( @@ -1587,6 +1763,24 @@ def _translate_slayer_select( f"{item.time_grain.value}({item.time_grain_underlying.dimension_ref})" ) item_by_projected_name.setdefault(canonical, item) + # DEV-1566: same canonical-form registration for column CAST projections. + # Lets ``SELECT CAST(col AS T) AS user_alias ... ORDER BY CAST(col AS T)`` + # resolve via the lowercase canonical key (`cast( as )`) even + # when the projection has a different user alias. ``setdefault`` so an + # explicit projection of the canonical form (rare) still wins. + for item in items: + if item.cast_target is None: + continue + dotted = ( + item.dimension.dimension_ref if item.dimension is not None + else item.metric.name if item.metric is not None + else None + ) + if dotted is None: + continue + item_by_projected_name.setdefault( + _alias_for_column_cast(dotted, item.cast_target), item, + ) order_items = _translate_order_by( parsed.args.get("order"), item_by_projected_name, strip_prefix=strip_prefix, ) diff --git a/slayer/pg_facade/types.py b/slayer/pg_facade/types.py index c020161a..301b38f3 100644 --- a/slayer/pg_facade/types.py +++ b/slayer/pg_facade/types.py @@ -77,6 +77,21 @@ def value_to_text( # NOSONAR(S3776) — flat per-Python-type dispatch # narrowed to a date before the per-Python-type dispatch below. if oid == OID_DATE and isinstance(value, _dt.datetime): value = value.date() + # DEV-1566: symmetric widening for CAST( AS TIMESTAMP). The + # bare `dt.date` branch below emits `YYYY-MM-DD`, which pgjdbc / + # psycopg2 mis-parse for an OID_TIMESTAMP-declared column. Widen to + # `dt.datetime` so the timestamp formatter emits `YYYY-MM-DD HH:MM:SS`. + if ( + oid == OID_TIMESTAMP + and isinstance(value, _dt.date) + and not isinstance(value, _dt.datetime) + ): + value = _dt.datetime(value.year, value.month, value.day) + # DEV-1566: CAST( AS TEXT) must emit `true`/`false` (Postgres + # text shape), not the BOOL wire shape `t`/`f` the next branch would + # produce. Check OID_TEXT BEFORE the bool branch. + if oid == OID_TEXT and isinstance(value, bool): + return b"true" if value else b"false" if isinstance(value, bool): return b"t" if value else b"f" if isinstance(value, float): diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 198df88f..70707f66 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -44,6 +44,10 @@ def _catalog() -> FacadeCatalog: Column(name="revenue", type=DataType.DOUBLE), Column(name="status", type=DataType.TEXT), Column(name="ordered_at", type=DataType.TIMESTAMP), + # DEV-1566: DATE column for CAST( AS TIMESTAMP) coverage. + Column(name="delivered_at", type=DataType.DATE), + # DEV-1566: BOOLEAN column for CAST allowlist coverage. + Column(name="is_paid", type=DataType.BOOLEAN), ], measures=[ ModelMeasure(name="aov", formula="revenue:sum / *:count", @@ -751,3 +755,433 @@ def test_limit_and_offset_pass_through(dialect) -> None: assert isinstance(result, QueryResult) assert result.query.limit == 100 assert result.query.offset == 50 + + +# --- CAST( AS ) projection (DEV-1566) -------------------------- +# +# The translator admits CAST around a bare/qualified Column reference, overrides +# the wire OID via projection_types, and leaves the engine SQL projecting the +# bare column. The strict allowlist mirrors the (source, target) pairs the wire +# encoders in slayer/pg_facade/types.py can losslessly handle. + + +def test_cast_column_projection_admits_date_to_timestamp(dialect) -> None: + """Linear repro: CAST( AS TIMESTAMP) round-trips through the + translator. Engine still projects the bare column; the wire layer learns + the new OID via projection_types.""" + result = translate( + sql="SELECT CAST(delivered_at AS TIMESTAMP) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + # Engine sees the bare column — no CAST pushed into SlayerQuery. + assert result.query.dimensions is not None + assert [d.full_name for d in result.query.dimensions] == ["delivered_at"] + # Wire schema reflects the casted type, not the column's declared DATE. + assert result.projection_types == [DataType.TIMESTAMP] + # column_name_mapping uses the inner column's dotted form as projected_name. + assert dict(result.column_name_mapping) == { + "orders.delivered_at": "delivered_at", + } + + +def test_cast_column_with_alias_uses_alias_as_projected_name(dialect) -> None: + """The engine still selects the BARE column; only the projected_name + (user-facing label) carries the alias. So engine_alias stays + ``orders.delivered_at`` — NOT ``orders.ts`` (which would be the + aggregate-alias pattern; aggregates rename the SLayer-level measure).""" + result = translate( + sql="SELECT CAST(delivered_at AS TIMESTAMP) AS ts FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert dict(result.column_name_mapping) == {"orders.delivered_at": "ts"} + assert result.projection_types == [DataType.TIMESTAMP] + + +def test_postgres_double_colon_cast_works() -> None: + """``col::TYPE`` sugar parses to ``exp.Cast`` under the postgres dialect — + same outcome as the keyword form.""" + result = translate( + sql="SELECT delivered_at::TIMESTAMP FROM orders", + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType.TIMESTAMP] + + +def test_cast_joined_column_projection(dialect) -> None: + """CAST around a joined dotted column ref resolves through the same + cross-model dimension path as a bare projection.""" + result = translate( + sql="SELECT CAST(customers.region AS TEXT) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.dimensions is not None + assert [d.full_name for d in result.query.dimensions] == ["customers.region"] + assert result.projection_types == [DataType.TEXT] + + +@pytest.mark.parametrize("type_name", ["UUID", "JSON", "ARRAY", "STRUCT"]) +def test_cast_unsupported_target_type_raises(type_name: str, dialect) -> None: + """Cast target types not in the SLayer DataType mapping fall through to + the existing 'Unsupported projection expression' error.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=f"SELECT CAST(revenue AS {type_name}) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert "Unsupported projection expression" in str(exc_info.value) + + +@pytest.mark.parametrize( + ("col", "target"), + [ + ("status", "INT"), # TEXT → INT + ("status", "BOOLEAN"), # TEXT → BOOLEAN + ("revenue", "BOOLEAN"), # DOUBLE → BOOLEAN + ("revenue", "INT"), # DOUBLE → INT (lossy; dropped from allowlist) + ("revenue", "DATE"), # DOUBLE → DATE + ("is_paid", "DATE"), # BOOLEAN → DATE + ("is_paid", "TIMESTAMP"), # BOOLEAN → TIMESTAMP + ("is_paid", "INT"), # BOOLEAN → INT + ("delivered_at", "INT"), # DATE → INT + ("delivered_at", "DOUBLE"),# DATE → DOUBLE + ("ordered_at", "INT"), # TIMESTAMP → INT + ("ordered_at", "DOUBLE"), # TIMESTAMP → DOUBLE + ("id", "DATE"), # INT → DATE + ("id", "BOOLEAN"), # INT → BOOLEAN + ], +) +def test_cast_rejected_coercions_raise(col: str, target: str, dialect) -> None: + """Pairs outside the §5 allowlist surface a strict, named error message.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=f"SELECT CAST({col} AS {target}) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert "Unsupported CAST" in str(exc_info.value) + + +def test_cast_rejected_error_message_pins_full_contract(dialect) -> None: + """The rejected-coercion error must name the source DataType, target + DataType, the offending SQL, and link the docs reference. Vague messages + would regress agent-debuggability.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql="SELECT CAST(status AS INT) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + msg = str(exc_info.value) + assert "Unsupported CAST" in msg + assert "TEXT" in msg # source DataType + assert "INT" in msg # target DataType + assert "CAST(status AS INT)" in msg # offending SQL fragment + assert "docs/interfaces/pg-facade.md" in msg # docs pointer + + +@pytest.mark.parametrize( + ("col", "target"), + [ + ("status", "TEXT"), # TEXT → TEXT + ("revenue", "DOUBLE"), # DOUBLE → DOUBLE + ("id", "INT"), # INT → INT + ("delivered_at", "DATE"), # DATE → DATE + ("ordered_at", "TIMESTAMP"),# TIMESTAMP → TIMESTAMP + ("is_paid", "BOOLEAN"), # BOOLEAN → BOOLEAN + ], +) +def test_cast_identity_pair_admitted_for_every_type(col: str, target: str, dialect) -> None: + result = translate( + sql=f"SELECT CAST({col} AS {target}) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType(target)] + + +@pytest.mark.parametrize( + ("col", "target", "expected_type"), + [ + # Date/time pair + ("delivered_at", "TIMESTAMP", DataType.TIMESTAMP), + ("ordered_at", "DATE", DataType.DATE), + # Numeric pair (INT→DOUBLE only; DOUBLE→INT dropped) + ("id", "DOUBLE", DataType.DOUBLE), + # X → TEXT (always admitted) + ("delivered_at", "TEXT", DataType.TEXT), + ("ordered_at", "TEXT", DataType.TEXT), + ("id", "TEXT", DataType.TEXT), + ("revenue", "TEXT", DataType.TEXT), + ("is_paid", "TEXT", DataType.TEXT), + ("status", "TEXT", DataType.TEXT), + ], +) +def test_cast_admitted_coercions_parametrised( + col: str, target: str, expected_type: DataType, dialect, +) -> None: + result = translate( + sql=f"SELECT CAST({col} AS {target}) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [expected_type] + + +def test_cast_try_cast_rejected(dialect) -> None: + """TRY_CAST parses to exp.TryCast, not exp.Cast, and is explicitly out + of scope — Postgres has no native TRY_CAST.""" + with pytest.raises(TranslationError): + translate( + sql="SELECT TRY_CAST(status AS INT) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + + +def test_cast_aggregate_inner_rejected(dialect) -> None: + """CAST( AS T) is explicitly out of scope (Column only).""" + with pytest.raises(TranslationError): + translate( + sql="SELECT CAST(SUM(revenue) AS DOUBLE) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + + +def test_cast_time_grain_compat_unchanged(dialect) -> None: + """CAST(DATE_TRUNC(...) AS DATE) is the time-grain pattern — body.this is + DateTrunc, not Column, so the new column-CAST branch returns None and the + existing time-grain CAST-unwrap still handles it.""" + result = translate( + sql="SELECT CAST(date_trunc('month', ordered_at) AS DATE), revenue_sum FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.time_dimensions is not None + assert result.query.time_dimensions[0].granularity == TimeGranularity.MONTH + + +def test_cast_order_by_canonical_form_resolves(dialect) -> None: + """An unaliased CAST projection ORDER-BY'd by the same CAST form resolves + to the underlying engine column via the canonical-form registration.""" + result = translate( + sql=( + "SELECT CAST(delivered_at AS TIMESTAMP) FROM orders " + "ORDER BY CAST(delivered_at AS TIMESTAMP) ASC" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].column.name == "delivered_at" + assert result.query.order[0].direction == "asc" + + +def test_cast_group_by_canonical_form_resolves(dialect) -> None: + """A CAST in projection + the same CAST repeated in GROUP BY (dim-only + dedupe shape) must validate, mirroring the time-grain GROUP BY canonical + registration.""" + result = translate( + sql=( + "SELECT CAST(delivered_at AS TIMESTAMP) FROM orders " + "GROUP BY CAST(delivered_at AS TIMESTAMP)" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType.TIMESTAMP] + + +def test_cast_unknown_source_datatype_admits_text_only(dialect) -> None: + """Custom metrics with declared data_type=None admit ONLY CAST→TEXT; + every other target is rejected so wire-encode-time crashes don't surface + as opaque connection errors. Custom aggregations carry data_type=None + when constructed without an explicit type — exercise that path.""" + orders = SlayerModel( + name="orders", + data_source="jaffle", + sql_table="orders", + columns=[ + Column(name="id", type=DataType.INT, primary_key=True), + Column(name="amount", type=DataType.DOUBLE), + ], + measures=[ + # No `type=` → ModelMeasure.type is None. + ModelMeasure(name="custom_metric", formula="amount:sum"), + ], + ) + catalog = build_catalog(models_by_datasource={"jaffle": [orders]}) + + # → TEXT admitted. + result = translate( + sql="SELECT CAST(custom_metric AS TEXT) FROM orders", + catalog=catalog, dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType.TEXT] + + # → TIMESTAMP rejected with the strict-allowlist message. + with pytest.raises(TranslationError) as exc_info: + translate( + sql="SELECT CAST(custom_metric AS TIMESTAMP) FROM orders", + catalog=catalog, dialect=dialect, + ) + assert "Unsupported CAST" in str(exc_info.value) + + +@pytest.mark.parametrize( + "col", ["delivered_at", "ordered_at", "id", "revenue", "is_paid", "status"], +) +def test_cast_text_target_admitted_from_every_source(col: str, dialect) -> None: + """X → TEXT is always admitted (stringification is universal).""" + result = translate( + sql=f"SELECT CAST({col} AS TEXT) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType.TEXT] + + +@pytest.mark.parametrize( + ("type_alias", "expected"), + [ + # TEXT-family aliases. + ("VARCHAR", DataType.TEXT), + ("CHAR", DataType.TEXT), + # INT-family aliases. + ("INTEGER", DataType.INT), + ("BIGINT", DataType.INT), + ("SMALLINT", DataType.INT), + # DOUBLE-family aliases (floating/decimal collapse to DOUBLE). + ("FLOAT", DataType.DOUBLE), + ("REAL", DataType.DOUBLE), + ("DECIMAL", DataType.DOUBLE), + ("NUMERIC", DataType.DOUBLE), + # TIMESTAMP-family aliases. + ("DATETIME", DataType.TIMESTAMP), + ("TIMESTAMPTZ", DataType.TIMESTAMP), + ], +) +def test_cast_sqlglot_type_aliases_map_to_slayer_datatype( + type_alias: str, expected: DataType, +) -> None: + """Each accepted sqlglot DataType.Type alias collapses onto the canonical + SLayer DataType. Pinned under postgres dialect since some aliases + (TIMESTAMPTZ) only parse cleanly there.""" + # Pick a source column whose declared type makes an + # admitted pair: identity on the expected canonical type. + source_col = { + DataType.TEXT: "status", + DataType.INT: "id", + DataType.DOUBLE: "revenue", + DataType.TIMESTAMP: "ordered_at", + }[expected] + result = translate( + sql=f"SELECT CAST({source_col} AS {type_alias}) FROM orders", + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [expected] + + +def test_cast_parameterised_type_form_works() -> None: + """sqlglot represents ``VARCHAR(255)`` / ``DECIMAL(10,2)`` etc. with their + precision modifier on the SAME ``DataType.Type`` member, so the mapping + collapses precision implicitly — SLayer wire types don't carry it.""" + result = translate( + sql="SELECT CAST(status AS VARCHAR(255)) FROM orders", + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType.TEXT] + + +def test_cast_non_column_non_aggregate_inner_rejected(dialect) -> None: + """The CAST detector REQUIRES body.this == exp.Column. Inner expressions + that aren't bare columns (hygiene wrappers like SUBSTRING, function + calls, arithmetic) must not accidentally route through the CAST branch + — they must fall through to the existing fallback.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql="SELECT CAST(SUBSTRING(status, 1, 1) AS TEXT) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + # Specifically: NOT the strict-allowlist message. + assert "Unsupported CAST" not in str(exc_info.value) + + +def test_cast_qualified_ref_order_by_canonical_resolves(dialect) -> None: + """Canonical CAST form must work for qualified (joined) refs too. + `cast(customers.region as text)` is the canonical key the registration + pushes into item_by_projected_name.""" + result = translate( + sql=( + "SELECT CAST(customers.region AS TEXT) FROM orders " + "ORDER BY CAST(customers.region AS TEXT) ASC" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].column.full_name == "customers.region" + assert result.query.order[0].direction == "asc" + + +def test_cast_qualified_ref_group_by_canonical_resolves(dialect) -> None: + """Same qualified-ref canonical-form coverage on the GROUP BY path.""" + result = translate( + sql=( + "SELECT CAST(customers.region AS TEXT) FROM orders " + "GROUP BY CAST(customers.region AS TEXT)" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types == [DataType.TEXT] + + +def test_cast_aliased_projection_order_by_canonical_resolves(dialect) -> None: + """The whole point of registering the canonical form via setdefault is + that ORDER BY by the CAST shape resolves even when the SELECT projection + carries a different user alias. ``SELECT CAST(x AS T) AS y ... ORDER BY + CAST(x AS T)`` must match the projection via the canonical key — not + via the alias `y`.""" + result = translate( + sql=( + "SELECT CAST(delivered_at AS TIMESTAMP) AS ts FROM orders " + "ORDER BY CAST(delivered_at AS TIMESTAMP) ASC" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].column.name == "delivered_at" + assert result.query.order[0].direction == "asc" + + +def test_cast_aliased_projection_group_by_canonical_resolves(dialect) -> None: + """Same setdefault canonical-form coverage on the GROUP BY path with + an aliased projection.""" + result = translate( + sql=( + "SELECT CAST(delivered_at AS TIMESTAMP) AS ts FROM orders " + "GROUP BY CAST(delivered_at AS TIMESTAMP)" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert dict(result.column_name_mapping) == {"orders.delivered_at": "ts"} + + +def test_cast_metric_projection_overrides_wire_type(dialect) -> None: + """A CAST around a metric reference resolves through the metric path + (`_record_metric`), and the cast target wins over the declared metric type.""" + result = translate( + sql="SELECT CAST(revenue_sum AS TEXT) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.measures is not None + assert result.query.measures[0].formula == "revenue:sum" + # Declared metric type is DOUBLE; CAST( AS TEXT) overrides. + assert result.projection_types == [DataType.TEXT] diff --git a/tests/integration/test_integration_pg_facade.py b/tests/integration/test_integration_pg_facade.py index afe0e189..deb1f52c 100644 --- a/tests/integration/test_integration_pg_facade.py +++ b/tests/integration/test_integration_pg_facade.py @@ -373,6 +373,21 @@ async def test_dml_rejected(pg_demo_server) -> None: await conn.close() +async def test_cast_unsupported_coercion_returns_postgres_error(pg_demo_server) -> None: + """DEV-1566: CAST( AS INT) is outside the admitted-coercion + allowlist (truncation/parse semantics differ from Postgres). The + translator surfaces a clean PostgresError with the strict-allowlist + message — not an internal connection crash at wire-encode time.""" + host, port = pg_demo_server + conn = await _connect(host, port) + try: + with pytest.raises(asyncpg.PostgresError) as exc_info: + await conn.fetch("SELECT CAST(name AS INT) FROM customers LIMIT 1") + assert "Unsupported CAST" in str(exc_info.value) + finally: + await conn.close() + + # --- parameterised query (literal substitution) ------------------------------ diff --git a/tests/integration/test_metabase_e2e.py b/tests/integration/test_metabase_e2e.py index e937c6cb..48830258 100644 --- a/tests/integration/test_metabase_e2e.py +++ b/tests/integration/test_metabase_e2e.py @@ -1124,10 +1124,6 @@ def test_date_psycopg_text_round_trip(metabase_e2e_env: MetabaseE2EEnv) -> None: conn.close() -@pytest.mark.xfail( - strict=True, - reason="DEV-1566: pg-facade rejects CAST( AS TIMESTAMP) in projection; no SLayer-query primitive for per-query type coercion", -) async def test_timestamp_round_trip_both_formats(metabase_e2e_env: MetabaseE2EEnv) -> None: host, port = metabase_e2e_env.pg_primary conn = await _asyncpg_connect(host, port, password=metabase_e2e_env.pg_primary_password) diff --git a/tests/pg_facade/test_types.py b/tests/pg_facade/test_types.py index f40bf045..7dc385aa 100644 --- a/tests/pg_facade/test_types.py +++ b/tests/pg_facade/test_types.py @@ -9,6 +9,9 @@ import pytest from slayer.core.enums import DataType +from slayer.core.models import Column, SlayerModel +from slayer.facade.catalog import build_catalog +from slayer.facade.translator import QueryResult, translate from slayer.pg_facade import types as t from slayer.pg_facade.protocol import ( OID_BOOL, @@ -42,6 +45,32 @@ def test_datatype_to_oid_none_falls_back_to_text() -> None: assert t.datatype_to_oid(None) == OID_TEXT +def test_query_result_oid_overridden_by_cast() -> None: + """DEV-1566: end-to-end — a CAST( AS TIMESTAMP) translation + yields a projection_types entry that maps to OID_TIMESTAMP, not the + column's declared OID_DATE.""" + orders = SlayerModel( + name="orders", data_source="jaffle", sql_table="orders", + columns=[ + Column(name="id", type=DataType.INT, primary_key=True), + Column(name="delivered_at", type=DataType.DATE), + ], + ) + catalog = build_catalog(models_by_datasource={"jaffle": [orders]}) + + # Baseline: bare projection → OID_DATE. + bare = translate(sql="SELECT delivered_at FROM orders", catalog=catalog) + assert isinstance(bare, QueryResult) + assert t.datatype_to_oid(bare.projection_types[0]) == OID_DATE + + # CAST projection → OID_TIMESTAMP (wire layer rewritten). + casted = translate( + sql="SELECT CAST(delivered_at AS TIMESTAMP) FROM orders", catalog=catalog, + ) + assert isinstance(casted, QueryResult) + assert t.datatype_to_oid(casted.projection_types[0]) == OID_TIMESTAMP + + # --- value_to_text ----------------------------------------------------------- @@ -50,8 +79,18 @@ def test_value_to_text_none_is_sql_null() -> None: def test_value_to_text_bool() -> None: - assert t.value_to_text(True) == b"t" - assert t.value_to_text(False) == b"f" + # DEV-1566: the default oid is OID_TEXT, which is the Postgres text shape + # for booleans (``true``/``false``). The BOOL-wire ``t``/``f`` shape is + # covered separately by test_value_to_text_bool_in_bool_column. + assert t.value_to_text(True) == b"true" + assert t.value_to_text(False) == b"false" + + +def test_value_to_text_bool_in_bool_column() -> None: + """OID_BOOL keeps the wire-format ``t``/``f`` shape — that's what the + Postgres binary protocol carries on the text-format path for BOOL.""" + assert t.value_to_text(True, OID_BOOL) == b"t" + assert t.value_to_text(False, OID_BOOL) == b"f" def test_value_to_text_int_and_decimal() -> None: @@ -102,6 +141,27 @@ def test_value_to_text_datetime_in_timestamp_column_is_preserved() -> None: assert t.value_to_text(ts, OID_TIMESTAMP) == b"2024-06-01 12:30:45" +def test_value_to_text_date_in_timestamp_column_is_widened() -> None: + """DEV-1566: symmetric widening to the existing OID_DATE-driven narrowing. + CAST( AS TIMESTAMP) needs the wire text payload to look like a + Postgres TIMESTAMP literal (``YYYY-MM-DD HH:MM:SS``) — without the widen + step pgjdbc/psycopg2 mis-parse a bare ``YYYY-MM-DD`` value for a column + declared with OID 1114.""" + d = dt.date(2024, 6, 1) + assert t.value_to_text(d, OID_TIMESTAMP) == b"2024-06-01 00:00:00" + # Default (no oid / OID_TEXT) preserves the bare date shape. + assert t.value_to_text(d) == b"2024-06-01" + + +def test_value_to_text_bool_in_text_column_uses_true_false() -> None: + """DEV-1566: CAST( AS TEXT) must surface Postgres-shaped boolean + text (``true``/``false``), not the BOOL wire shape (``t``/``f``). + Default oid == OID_TEXT so the explicit and default forms agree; + BOOL-wire ``t``/``f`` is reachable only via explicit OID_BOOL.""" + assert t.value_to_text(True, OID_TEXT) == b"true" + assert t.value_to_text(False, OID_TEXT) == b"false" + + def test_value_to_text_str_and_bytes() -> None: assert t.value_to_text("hello") == b"hello" assert t.value_to_text(b"raw") == b"raw" From 699f87bd3ee7b66662191c8acdcf669f62619ea6 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 11:47:27 +0200 Subject: [PATCH 02/13] DEV-1566 PR #185 round 1: address CodeRabbit + Sonar follow-up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - All 4 `_alias_for_column_cast` call sites now use keyword args (CodeRabbit nit + one site it missed at line 1678) per the project's "≥2 params → kwargs" rule. - Sonar S3358 nested-conditional flags on the metric→dim→None and dim→metric→None chained ternaries are NOSONAR-suppressed with rationale inline (the fallback shape is the readable idiom). - Sonar S3776 cognitive-complexity flag on `_resolve_projection` is NOSONAR-suppressed — the body is a flat dispatch over projection-expression shapes; each new shape adds exactly one branch by design. - Sonar S3776 cognitive-complexity flag on `_translate_slayer_select` is resolved by extracting the two `setdefault` canonical-form-registration loops (time-grain + DEV-1566 CAST) into a new `_index_items_by_canonical_form(items)` helper. The body of `_translate_slayer_select` now reads as a flat pipeline. Co-Authored-By: Claude Opus 4.7 --- slayer/facade/translator.py | 78 ++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index 66662503..c2194993 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -928,7 +928,7 @@ def _resolve_column_cast_projection( metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, strip_prefix=strip_prefix, ) - source: Optional[DataType] = ( + source: Optional[DataType] = ( # NOSONAR(S3358) — metric→dim→None fallback; chained ternary is the idiomatic shape inner.metric.data_type if inner.metric is not None else inner.dimension.data_type if inner.dimension is not None else None @@ -1024,7 +1024,7 @@ def _raw_column_parts(col: exp.Column) -> List[str]: return parts -def _resolve_projection( +def _resolve_projection( # NOSONAR(S3776) — flat dispatch over projection-expression shapes; one branch per shape by design expressions: Sequence[exp.Expression], table: FacadeTable, *, schema_name: Optional[str] = None, ) -> List[_ProjectionItem]: @@ -1419,7 +1419,8 @@ def _order_by_name( if cast_match is not None: inner_col, target = cast_match return _alias_for_column_cast( - _column_to_dotted(inner_col, strip_prefix=strip_prefix), target, + dotted=_column_to_dotted(inner_col, strip_prefix=strip_prefix), + target=target, ) return body.sql() @@ -1452,8 +1453,8 @@ def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; e if cast_match is not None: inner_col, target = cast_match user_items.append(_alias_for_column_cast( - _column_to_dotted(inner_col, strip_prefix=strip_prefix), - target, + dotted=_column_to_dotted(inner_col, strip_prefix=strip_prefix), + target=target, )) continue user_items.append(g.sql()) @@ -1675,7 +1676,7 @@ def _record_dimension( # `GROUP BY CAST(col AS T)` validates (mirrors the time-grain pattern). if item.cast_target is not None: canonical = _alias_for_column_cast( - item.dimension.dimension_ref, item.cast_target, + dotted=item.dimension.dimension_ref, target=item.cast_target, ) if canonical != item.projected_name: plan.derived_dims.append(canonical) @@ -1702,6 +1703,40 @@ def _build_projection_plan( return plan +def _index_items_by_canonical_form( + items: Sequence[_ProjectionItem], +) -> Dict[str, _ProjectionItem]: + """Index projection items by alias plus canonical forms (time-grain, cast). + + Time-grain items are also keyed by ``grain(col)`` so an unaliased + Metabase-style GROUP BY / ORDER BY (``ORDER BY CAST(DATE_TRUNC('month', + ordered_at) AS DATE)``) resolves against the aliased projection. + DEV-1566: column-CAST items are keyed by the same lowercase + ``cast( as )`` form so ``SELECT CAST(col AS T) AS user_alias + ... ORDER BY CAST(col AS T)`` resolves. ``setdefault`` so an explicit + projection of the canonical form (rare) still wins. + """ + by_name: Dict[str, _ProjectionItem] = {item.projected_name: item for item in items} + for item in items: + if item.time_grain is not None and item.time_grain_underlying is not None: + canonical = ( + f"{item.time_grain.value}({item.time_grain_underlying.dimension_ref})" + ) + by_name.setdefault(canonical, item) + if item.cast_target is not None: + dotted = ( # NOSONAR(S3358) — dim→metric→None fallback; chained ternary is the idiomatic shape + item.dimension.dimension_ref if item.dimension is not None + else item.metric.name if item.metric is not None + else None + ) + if dotted is not None: + by_name.setdefault( + _alias_for_column_cast(dotted=dotted, target=item.cast_target), + item, + ) + return by_name + + def _parse_int_literal(node: Optional[exp.Expression]) -> Optional[int]: """Pull an int out of ``LIMIT N`` / ``OFFSET N`` style nodes.""" if node is None or not isinstance(node.expression, exp.Literal): @@ -1751,36 +1786,7 @@ def _translate_slayer_select( parsed.args.get("having"), table, filters, strip_prefix=strip_prefix, ) - item_by_projected_name = {item.projected_name: item for item in items} - # Also index time-grain items under their canonical ``grain(col)`` form so - # an unaliased Metabase-style GROUP BY / ORDER BY (``ORDER BY CAST( - # DATE_TRUNC('month', ordered_at) AS DATE)``) resolves against the - # aliased projection (``... AS "ordered_at"``). - for item in items: - if item.time_grain is None or item.time_grain_underlying is None: - continue - canonical = ( - f"{item.time_grain.value}({item.time_grain_underlying.dimension_ref})" - ) - item_by_projected_name.setdefault(canonical, item) - # DEV-1566: same canonical-form registration for column CAST projections. - # Lets ``SELECT CAST(col AS T) AS user_alias ... ORDER BY CAST(col AS T)`` - # resolve via the lowercase canonical key (`cast( as )`) even - # when the projection has a different user alias. ``setdefault`` so an - # explicit projection of the canonical form (rare) still wins. - for item in items: - if item.cast_target is None: - continue - dotted = ( - item.dimension.dimension_ref if item.dimension is not None - else item.metric.name if item.metric is not None - else None - ) - if dotted is None: - continue - item_by_projected_name.setdefault( - _alias_for_column_cast(dotted, item.cast_target), item, - ) + item_by_projected_name = _index_items_by_canonical_form(items) order_items = _translate_order_by( parsed.args.get("order"), item_by_projected_name, strip_prefix=strip_prefix, ) From 003663df4e6dbe8a014cfed4d7f2eef946cf4faa Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 15:15:35 +0200 Subject: [PATCH 03/13] DEV-1566 PR #185 round 2: gate CAST projection to pg-facade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex flagged that DEV-1566's CAST( AS ) projection lives in the shared facade translator, so the Flight SQL facade picked it up too — but Flight materialises rows via pa.Table.from_pylist against a catalog-typed schema, which raises ArrowTypeError when the engine's Python value type doesn't match the declared Arrow type (date vs pa.timestamp, bool vs pa.utf8, etc.). The pg-facade has dedicated value coercion for the admitted pairs; Flight has none. Add an `allow_column_cast: bool = True` kwarg to `translate()` / `_translate_slayer_select` / `_resolve_projection`. When False, the CAST projection branch is skipped and the body falls through to the existing "Unsupported projection expression" terminal error. The Flight shim (`slayer/flight/translator.py`) now passes `allow_column_cast=False`. The pg-facade keeps the default. Time-grain CAST unwrap (`CAST(DATE_TRUNC(...) AS DATE)` — the Metabase fingerprint) is unaffected because it runs before the column-CAST branch in `_resolve_projection`. Tests: - tests/facade/test_translator.py: kwarg gate, default-True parity, time-grain carve-out. - tests/flight/test_translator.py (new file): parametrised rejection across the regression-prone CAST pairs (DATE→TIMESTAMP, X→TEXT for every X), plus time-grain carve-out under the Flight shim. Co-Authored-By: Claude Opus 4.7 --- slayer/facade/translator.py | 46 ++++++++++++------ slayer/flight/translator.py | 14 +++++- tests/facade/test_translator.py | 54 ++++++++++++++++++++++ tests/flight/test_translator.py | 82 +++++++++++++++++++++++++++++++++ 4 files changed, 181 insertions(+), 15 deletions(-) create mode 100644 tests/flight/test_translator.py diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index c2194993..0ba8c524 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -1027,8 +1027,17 @@ def _raw_column_parts(col: exp.Column) -> List[str]: def _resolve_projection( # NOSONAR(S3776) — flat dispatch over projection-expression shapes; one branch per shape by design expressions: Sequence[exp.Expression], table: FacadeTable, *, schema_name: Optional[str] = None, + allow_column_cast: bool = True, ) -> List[_ProjectionItem]: - """Walk the projection list, classifying each item against the table.""" + """Walk the projection list, classifying each item against the table. + + ``allow_column_cast`` gates the DEV-1566 ``CAST( AS )`` + projection branch. The pg-facade passes ``True`` (default) — its wire + encoders coerce values to the declared OID. The Flight facade passes + ``False`` because its ``pa.Table.from_pylist`` schema-binding rejects + values whose Python type doesn't match the declared Arrow type and the + facade has no value-coercion pass. + """ metrics_by_name = {m.name: m for m in table.metrics} metrics_by_formula = {m.measure_formula: m for m in table.metrics} dims_by_name = {d.name: d for d in table.dimensions} @@ -1072,16 +1081,20 @@ def _resolve_projection( # NOSONAR(S3776) — flat dispatch over projection-exp # time-grain unwrap (preserves Metabase's CAST(DATE_TRUNC(...) AS DATE)) # and AFTER the aggregate detector (CAST(SUM(...) AS T) is out of # scope — the detector requires the inner to be a bare Column). - cast_match = _detect_column_cast(body) - if cast_match is not None: - inner_col, cast_target = cast_match - out.append(_resolve_column_cast_projection( - body=inner_col, cast_target=cast_target, - alias_name=alias_name, table=table, - metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, - strip_prefix=strip_prefix, - )) - continue + # Gated by allow_column_cast so the Flight facade falls through to + # the "Unsupported projection expression" error instead of producing + # a wire schema its row materialiser can't fulfil. + if allow_column_cast: + cast_match = _detect_column_cast(body) + if cast_match is not None: + inner_col, cast_target = cast_match + out.append(_resolve_column_cast_projection( + body=inner_col, cast_target=cast_target, + alias_name=alias_name, table=table, + metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, + strip_prefix=strip_prefix, + )) + continue if isinstance(body, exp.Column): out.append(_resolve_column_projection( @@ -1527,6 +1540,7 @@ def translate( catalog_sql_executor: ( "Optional[CatalogSqlExecutorProtocol | Callable[[], CatalogSqlExecutorProtocol]]" ) = None, + allow_column_cast: bool = True, ) -> TranslatorResult: """Translate a SQL string into a TranslatorResult. @@ -1598,7 +1612,9 @@ def translate( ) # Step 5 / 6 — SLayer-table translation. - return _translate_slayer_select(parsed, catalog) + return _translate_slayer_select( + parsed, catalog, allow_column_cast=allow_column_cast, + ) # Lightweight Protocol so the translator doesn't pull catalog_sql at import @@ -1749,6 +1765,7 @@ def _parse_int_literal(node: Optional[exp.Expression]) -> Optional[int]: def _translate_slayer_select( parsed: exp.Select, catalog: FacadeCatalog, + *, allow_column_cast: bool = True, ) -> QueryResult: from_clause = parsed.args.get("from_") if from_clause is None: @@ -1770,7 +1787,10 @@ def _translate_slayer_select( (schema_name, table.name) if schema_name else None ) - items = _resolve_projection(proj_exprs, table, schema_name=schema_name) + items = _resolve_projection( + proj_exprs, table, schema_name=schema_name, + allow_column_cast=allow_column_cast, + ) plan = _build_projection_plan(items, table) _validate_group_by( diff --git a/slayer/flight/translator.py b/slayer/flight/translator.py index 2a4ca4c2..8ee4fb91 100644 --- a/slayer/flight/translator.py +++ b/slayer/flight/translator.py @@ -50,8 +50,18 @@ class InfoSchemaResult(TranslatorResult): def translate(sql: str, catalog: FacadeCatalog) -> TranslatorResult: """Translate ``sql`` for the Flight facade (``dialect=None``), converting - the shared ``RowBatch`` results into Arrow-shaped ones.""" - result = _shared_translate(sql, catalog, dialect=None) + the shared ``RowBatch`` results into Arrow-shaped ones. + + ``allow_column_cast=False`` — DEV-1566 ``CAST( AS )`` projections + are gated to the pg-facade. The Flight handler materialises rows via + ``pa.Table.from_pylist`` against a catalog-typed schema, which rejects + values whose Python type doesn't match the declared Arrow type + (e.g. ``datetime.date`` against ``pa.timestamp``); Flight has no + value-coercion pass to bridge the gap. Rejecting at translate time + surfaces a clean ``TranslationError`` instead of an opaque + ``ArrowTypeError`` at materialisation. + """ + result = _shared_translate(sql, catalog, dialect=None, allow_column_cast=False) if isinstance(result, _SharedProbeResult): return ProbeResult(table=row_batch_to_arrow(result.batch)) if isinstance(result, _SharedInfoSchemaResult): diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 70707f66..201334ac 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1185,3 +1185,57 @@ def test_cast_metric_projection_overrides_wire_type(dialect) -> None: assert result.query.measures[0].formula == "revenue:sum" # Declared metric type is DOUBLE; CAST( AS TEXT) overrides. assert result.projection_types == [DataType.TEXT] + + +# --- allow_column_cast gate (Codex round 1 — Flight regression guard) -------- +# +# DEV-1566 admits CAST( AS ) projection in the shared translator. +# The Flight facade materialises rows via pa.Table.from_pylist against a +# catalog-typed schema, which raises ArrowTypeError on values whose Python +# type doesn't match the declared Arrow type (date vs timestamp, bool vs +# utf8, etc.). The Flight shim passes allow_column_cast=False to reject the +# new projection shape at translate time; this test pins the gate. + + +def test_allow_column_cast_false_rejects_cast_projection() -> None: + """With allow_column_cast=False, the CAST projection branch is skipped + and the body falls through to the 'Unsupported projection expression' + error (the existing terminal path).""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql="SELECT CAST(delivered_at AS TIMESTAMP) FROM orders", + catalog=_catalog(), dialect=None, allow_column_cast=False, + ) + assert "Unsupported projection expression" in str(exc_info.value) + + +def test_allow_column_cast_false_leaves_time_grain_cast_unwrap_working() -> None: + """The gate must not regress the time-grain CAST-unwrap path — the + Metabase fingerprint ``CAST(DATE_TRUNC(...) AS DATE)`` is detected by + ``_detect_time_grain``, which runs BEFORE the column-CAST branch.""" + result = translate( + sql=( + "SELECT CAST(date_trunc('month', ordered_at) AS DATE), revenue_sum " + "FROM orders" + ), + catalog=_catalog(), dialect=None, allow_column_cast=False, + ) + assert isinstance(result, QueryResult) + assert result.query.time_dimensions is not None + assert result.query.time_dimensions[0].granularity == TimeGranularity.MONTH + + +def test_allow_column_cast_default_true_unchanged(dialect) -> None: + """Sanity: the default-True path is the same as not passing the kwarg + (pg-facade behaviour). Pinned so the default never silently flips.""" + explicit = translate( + sql="SELECT CAST(delivered_at AS TIMESTAMP) FROM orders", + catalog=_catalog(), dialect=dialect, allow_column_cast=True, + ) + implicit = translate( + sql="SELECT CAST(delivered_at AS TIMESTAMP) FROM orders", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(explicit, QueryResult) + assert isinstance(implicit, QueryResult) + assert explicit.projection_types == implicit.projection_types == [DataType.TIMESTAMP] diff --git a/tests/flight/test_translator.py b/tests/flight/test_translator.py new file mode 100644 index 00000000..0a5fd142 --- /dev/null +++ b/tests/flight/test_translator.py @@ -0,0 +1,82 @@ +"""Tests for slayer.flight.translator — Flight-shim-specific behaviour. + +Most translator coverage lives in tests/facade/test_translator.py (the shared +SQL → SlayerQuery pipeline). This file pins behaviours that are specific to +the Flight shim's `_shared_translate(..., allow_column_cast=False)` call: + +* DEV-1566 ``CAST( AS )`` projection is rejected at translate + time (Codex round 1 — Flight has no value-coercion pass and would crash + inside ``pa.Table.from_pylist`` if the projection were admitted). +* The time-grain ``CAST(DATE_TRUNC(...) AS DATE)`` Metabase fingerprint is + unaffected — the time-grain unwrap runs before the column-CAST branch. +""" + +from __future__ import annotations + +import pytest + +from slayer.core.enums import DataType +from slayer.core.models import Column, SlayerModel +from slayer.facade.catalog import FacadeCatalog, build_catalog +from slayer.flight.translator import QueryResult, TranslationError, translate + + +def _catalog() -> FacadeCatalog: + orders = SlayerModel( + name="orders", + data_source="jaffle", + sql_table="orders", + columns=[ + Column(name="id", type=DataType.INT, primary_key=True), + Column(name="revenue", type=DataType.DOUBLE), + Column(name="status", type=DataType.TEXT), + Column(name="ordered_at", type=DataType.TIMESTAMP), + Column(name="delivered_at", type=DataType.DATE), + Column(name="is_paid", type=DataType.BOOLEAN), + ], + ) + return build_catalog(models_by_datasource={"jaffle": [orders]}) + + +# --- DEV-1566 gate: Flight rejects CAST( AS ) projections --------- + + +@pytest.mark.parametrize( + ("col", "target"), + [ + ("delivered_at", "TIMESTAMP"), # DATE → TIMESTAMP (date can't fill pa.timestamp) + ("is_paid", "TEXT"), # BOOLEAN → TEXT (bool can't fill pa.utf8) + ("revenue", "TEXT"), # DOUBLE → TEXT + ("id", "TEXT"), # INT → TEXT + ("delivered_at", "TEXT"), # DATE → TEXT + ("ordered_at", "TEXT"), # TIMESTAMP → TEXT + ], +) +def test_flight_rejects_cast_projection(col: str, target: str) -> None: + """The Flight shim sets allow_column_cast=False so the CAST projection + branch is skipped and the body falls through to the existing + 'Unsupported projection expression' terminal error. Without this gate + pa.Table.from_pylist would raise ArrowTypeError at materialisation.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=f"SELECT CAST({col} AS {target}) FROM orders", + catalog=_catalog(), + ) + assert "Unsupported projection expression" in str(exc_info.value) + + +def test_flight_admits_time_grain_cast_unwrap() -> None: + """``CAST(DATE_TRUNC(, ) AS DATE)`` is the Metabase time-grain + fingerprint, NOT a column-CAST projection. The time-grain unwrap runs + BEFORE the column-CAST branch in _resolve_projection, so the gate must + not regress it. (This is the pattern Flight clients actually emit.)""" + result = translate( + sql=( + "SELECT CAST(date_trunc('month', ordered_at) AS DATE), revenue_sum " + "FROM orders" + ), + catalog=_catalog(), + ) + assert isinstance(result, QueryResult) + assert result.query.time_dimensions is not None + assert result.query.time_dimensions[0].granularity.value == "month" From 15bf15f8357172bc3fdd5a2c28740e1c7f6c42b7 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 15:45:59 +0200 Subject: [PATCH 04/13] DEV-1566 PR #185 round 3: drop CAST canonical-form in ORDER BY / GROUP BY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 2 flagged that admitting CAST( AS ) in ORDER BY and GROUP BY gives wrong semantics: - ``ORDER BY CAST(id AS TEXT)`` sorts numerically (engine projects the bare INT column) instead of lex. - ``GROUP BY CAST(ts AS DATE)`` groups per-timestamp instead of per-date (engine groups by the bare TIMESTAMP column). The CAST is documented as a pure wire-layer type rewrite — the engine query is unchanged — so ORDER BY / GROUP BY can't honour the casted semantics without pushing CAST into the engine SQL (out of scope for DEV-1566). Remove the canonical-form registrations so these queries surface a clean "ORDER BY column ... is not in the projection list" / GROUP BY strict-on- extras error. Removed: - `_alias_for_column_cast` (no longer reachable) - `_order_by_name` cast branch - `_validate_group_by` cast branch - `_record_dimension` cast canonical-form push into `derived_dims` - `_index_items_by_canonical_form` cast canonical-form push into the item index Workaround: alias the CAST projection and reference the alias in ORDER BY / GROUP BY (``SELECT CAST(c AS T) AS x ... ORDER BY x``). Wire-type override is preserved; only the silent-semantics-mismatch path is rejected. Tests: converted the four prior canonical-form-resolves tests into rejection tests, kept the aliased-projection workaround test (now exercises the explicit alias path). Added doc paragraph in docs/interfaces/pg-facade.md. Co-Authored-By: Claude Opus 4.7 --- docs/interfaces/pg-facade.md | 24 +++++ slayer/facade/translator.py | 64 +++----------- tests/facade/test_translator.py | 151 ++++++++++++++++++-------------- 3 files changed, 123 insertions(+), 116 deletions(-) diff --git a/docs/interfaces/pg-facade.md b/docs/interfaces/pg-facade.md index 3e7c6d07..1dbec490 100644 --- a/docs/interfaces/pg-facade.md +++ b/docs/interfaces/pg-facade.md @@ -159,6 +159,30 @@ Out of scope: `CAST` around aggregates (`CAST(SUM(amount) AS DOUBLE)`), `TRY_CAS and `CAST` around expressions that aren't a bare column (`CAST(SUBSTRING(...) AS T)`). `CAST` wrapping a `DATE_TRUNC(...)` continues to route through the time-grain unwrap. +`CAST(...)` in `ORDER BY` and `GROUP BY` is **not** admitted. The engine still +projects the bare column, so a `ORDER BY CAST(id AS TEXT)` would sort numerically +(by the underlying `INT`) instead of lex; a `GROUP BY CAST(ts AS DATE)` would +group per-timestamp instead of per-date. Both are silently wrong, so the +translator raises `ORDER BY column '...' is not in the projection list` / +the GROUP BY strict-on-extras error. Use the alias workaround: + +```sql +-- Rejected: +SELECT CAST(delivered_at AS TIMESTAMP) FROM orders +ORDER BY CAST(delivered_at AS TIMESTAMP); + +-- Works: +SELECT CAST(delivered_at AS TIMESTAMP) AS dt FROM orders +ORDER BY dt; +``` + +The wire-type override still applies — `dt` is wire-typed `TIMESTAMP` — but the +sort runs against the engine column's natural type (`DATE` here, which is +order-preserving so the result is correct). For genuinely lossy semantics +(e.g. lex sort by `TEXT`-cast `INT`) the alias path makes the limitation +explicit at the SQL level. A future ticket can lift this restriction by +pushing the CAST into the engine SQL. + Admitted (source, target) coercions: | Source type | Admitted target types | diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index 0ba8c524..a1e1ade2 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -295,14 +295,6 @@ def _is_admitted_cast(source: Optional[DataType], target: DataType) -> bool: return target in _CAST_ADMITTED_TARGETS.get(source, frozenset()) -def _alias_for_column_cast(dotted: str, target: DataType) -> str: - """Canonical ORDER BY / GROUP BY key for an unaliased ``CAST(col AS T)`` - projection. Mirrors ``_alias_for_time_grain`` — lowercase, dotted, - unquoted, no precision — so the equality lookup in ``_translate_slayer_select`` - matches what ``_order_by_name`` / ``_validate_group_by`` produce.""" - return f"cast({dotted} as {target.value.lower()})" - - def _column_to_dotted( col: exp.Column, *, strip_prefix: Optional[Tuple[str, str]] = None, ) -> str: @@ -1426,15 +1418,6 @@ def _order_by_name( if grain_match is not None: grain, col = grain_match return _alias_for_time_grain(grain, col, strip_prefix=strip_prefix) - # DEV-1566: CAST( AS ) ORDER BY resolves via the same - # lowercase canonical key the projection-time registration emits. - cast_match = _detect_column_cast(body) - if cast_match is not None: - inner_col, target = cast_match - return _alias_for_column_cast( - dotted=_column_to_dotted(inner_col, strip_prefix=strip_prefix), - target=target, - ) return body.sql() @@ -1459,17 +1442,6 @@ def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; e grain, col, strip_prefix=strip_prefix, )) continue - # DEV-1566: CAST( AS ) GROUP BY canonicalises to - # the same lowercase form _record_dimension registers in - # derived_dims, so dim-only `GROUP BY CAST(col AS T)` validates. - cast_match = _detect_column_cast(g) - if cast_match is not None: - inner_col, target = cast_match - user_items.append(_alias_for_column_cast( - dotted=_column_to_dotted(inner_col, strip_prefix=strip_prefix), - target=target, - )) - continue user_items.append(g.sql()) for u in user_items: if u not in derived_set: @@ -1688,14 +1660,6 @@ def _record_dimension( assert item.dimension is not None plan.dimension_refs.append(ColumnRef.from_string(item.dimension.dimension_ref)) plan.derived_dims.append(item.projected_name) - # DEV-1566: register the canonical CAST form as a derived dim so a - # `GROUP BY CAST(col AS T)` validates (mirrors the time-grain pattern). - if item.cast_target is not None: - canonical = _alias_for_column_cast( - dotted=item.dimension.dimension_ref, target=item.cast_target, - ) - if canonical != item.projected_name: - plan.derived_dims.append(canonical) engine_alias = f"{table.name}.{item.dimension.dimension_ref}" plan.column_name_mapping.append((engine_alias, item.projected_name)) # DEV-1566: CAST( AS T) overrides the declared dim type. @@ -1722,15 +1686,22 @@ def _build_projection_plan( def _index_items_by_canonical_form( items: Sequence[_ProjectionItem], ) -> Dict[str, _ProjectionItem]: - """Index projection items by alias plus canonical forms (time-grain, cast). + """Index projection items by alias plus canonical forms (time-grain only). Time-grain items are also keyed by ``grain(col)`` so an unaliased Metabase-style GROUP BY / ORDER BY (``ORDER BY CAST(DATE_TRUNC('month', ordered_at) AS DATE)``) resolves against the aliased projection. - DEV-1566: column-CAST items are keyed by the same lowercase - ``cast( as )`` form so ``SELECT CAST(col AS T) AS user_alias - ... ORDER BY CAST(col AS T)`` resolves. ``setdefault`` so an explicit - projection of the canonical form (rare) still wins. + + Column-CAST items are intentionally NOT registered here: DEV-1566's CAST + projection is a wire-type override that leaves the engine query + projecting the bare column, so ``ORDER BY CAST(col AS T)`` would sort by + the engine column's natural type (numeric/temporal) instead of the + casted type's semantics (e.g. lex for TEXT), and + ``GROUP BY CAST(ts AS DATE)`` would group per-timestamp instead of + per-date. Both are silently wrong. We reject these queries with the + existing "ORDER BY column not in projection list" / GROUP BY + strict-on-extras error and the user aliases the CAST projection to + reference it (``SELECT CAST(c AS T) AS x ... ORDER BY x``). """ by_name: Dict[str, _ProjectionItem] = {item.projected_name: item for item in items} for item in items: @@ -1739,17 +1710,6 @@ def _index_items_by_canonical_form( f"{item.time_grain.value}({item.time_grain_underlying.dimension_ref})" ) by_name.setdefault(canonical, item) - if item.cast_target is not None: - dotted = ( # NOSONAR(S3358) — dim→metric→None fallback; chained ternary is the idiomatic shape - item.dimension.dimension_ref if item.dimension is not None - else item.metric.name if item.metric is not None - else None - ) - if dotted is not None: - by_name.setdefault( - _alias_for_column_cast(dotted=dotted, target=item.cast_target), - item, - ) return by_name diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 201334ac..826b4a8d 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -961,13 +961,52 @@ def test_cast_time_grain_compat_unchanged(dialect) -> None: assert result.query.time_dimensions[0].granularity == TimeGranularity.MONTH -def test_cast_order_by_canonical_form_resolves(dialect) -> None: - """An unaliased CAST projection ORDER-BY'd by the same CAST form resolves - to the underlying engine column via the canonical-form registration.""" +def test_cast_order_by_unaliased_rejected(dialect) -> None: + """Codex round 2: ORDER BY CAST( AS ) is rejected because the + engine query still projects the bare column — sorting by the bare column + follows the engine column's natural type (numeric for INT, temporal for + TIMESTAMP) instead of the casted type's semantics (lex for TEXT), which + is silently wrong. Workaround: alias the CAST projection and ORDER BY + the alias (see test_cast_order_by_via_alias_works).""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=( + "SELECT CAST(id AS TEXT) FROM orders " + "ORDER BY CAST(id AS TEXT) ASC" + ), + catalog=_catalog(), dialect=dialect, + ) + assert "ORDER BY" in str(exc_info.value) + assert "not in the projection list" in str(exc_info.value) + + +def test_cast_group_by_unaliased_rejected(dialect) -> None: + """Codex round 2: GROUP BY CAST( AS ) is rejected. For lossy + pairs like TIMESTAMP→DATE the engine SQL still groups by the bare + column, so multiple timestamps on the same date produce duplicate + rows in the client (silently wrong).""" + with pytest.raises(TranslationError): + translate( + sql=( + "SELECT CAST(ordered_at AS DATE) FROM orders " + "GROUP BY CAST(ordered_at AS DATE)" + ), + catalog=_catalog(), dialect=dialect, + ) + + +def test_cast_order_by_via_alias_works(dialect) -> None: + """The documented workaround: alias the CAST projection and ORDER BY + the alias. The alias resolves to the projection item directly, no + canonical-form registration needed. Sort order still follows the engine + column's natural type — DATE→TIMESTAMP is 1:1 so the result is correct, + but ``SELECT CAST(id AS TEXT) AS x ... ORDER BY x`` would still sort + numerically. The alias path makes the limitation explicit at the SQL + level instead of hiding it behind a canonical-form rewrite.""" result = translate( sql=( - "SELECT CAST(delivered_at AS TIMESTAMP) FROM orders " - "ORDER BY CAST(delivered_at AS TIMESTAMP) ASC" + "SELECT CAST(delivered_at AS TIMESTAMP) AS dt FROM orders " + "ORDER BY dt ASC" ), catalog=_catalog(), dialect=dialect, ) @@ -975,20 +1014,7 @@ def test_cast_order_by_canonical_form_resolves(dialect) -> None: assert result.query.order is not None assert result.query.order[0].column.name == "delivered_at" assert result.query.order[0].direction == "asc" - - -def test_cast_group_by_canonical_form_resolves(dialect) -> None: - """A CAST in projection + the same CAST repeated in GROUP BY (dim-only - dedupe shape) must validate, mirroring the time-grain GROUP BY canonical - registration.""" - result = translate( - sql=( - "SELECT CAST(delivered_at AS TIMESTAMP) FROM orders " - "GROUP BY CAST(delivered_at AS TIMESTAMP)" - ), - catalog=_catalog(), dialect=dialect, - ) - assert isinstance(result, QueryResult) + # Wire schema still reflects the cast. assert result.projection_types == [DataType.TIMESTAMP] @@ -1110,46 +1136,42 @@ def test_cast_non_column_non_aggregate_inner_rejected(dialect) -> None: assert "Unsupported CAST" not in str(exc_info.value) -def test_cast_qualified_ref_order_by_canonical_resolves(dialect) -> None: - """Canonical CAST form must work for qualified (joined) refs too. - `cast(customers.region as text)` is the canonical key the registration - pushes into item_by_projected_name.""" - result = translate( - sql=( - "SELECT CAST(customers.region AS TEXT) FROM orders " - "ORDER BY CAST(customers.region AS TEXT) ASC" - ), - catalog=_catalog(), dialect=dialect, - ) - assert isinstance(result, QueryResult) - assert result.query.order is not None - assert result.query.order[0].column.full_name == "customers.region" - assert result.query.order[0].direction == "asc" +def test_cast_qualified_ref_order_by_unaliased_rejected(dialect) -> None: + """Codex round 2: same rejection for qualified (joined) CAST refs in + ORDER BY. Engine still selects the bare ``customers.region``; lex sort + semantics don't carry through. User must alias and ORDER BY the alias.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=( + "SELECT CAST(customers.region AS TEXT) FROM orders " + "ORDER BY CAST(customers.region AS TEXT) ASC" + ), + catalog=_catalog(), dialect=dialect, + ) + assert "not in the projection list" in str(exc_info.value) -def test_cast_qualified_ref_group_by_canonical_resolves(dialect) -> None: - """Same qualified-ref canonical-form coverage on the GROUP BY path.""" - result = translate( - sql=( - "SELECT CAST(customers.region AS TEXT) FROM orders " - "GROUP BY CAST(customers.region AS TEXT)" - ), - catalog=_catalog(), dialect=dialect, - ) - assert isinstance(result, QueryResult) - assert result.projection_types == [DataType.TEXT] +def test_cast_qualified_ref_group_by_unaliased_rejected(dialect) -> None: + """Codex round 2: same rejection for qualified-ref GROUP BY CAST.""" + with pytest.raises(TranslationError): + translate( + sql=( + "SELECT CAST(customers.region AS TEXT) FROM orders " + "GROUP BY CAST(customers.region AS TEXT)" + ), + catalog=_catalog(), dialect=dialect, + ) -def test_cast_aliased_projection_order_by_canonical_resolves(dialect) -> None: - """The whole point of registering the canonical form via setdefault is - that ORDER BY by the CAST shape resolves even when the SELECT projection - carries a different user alias. ``SELECT CAST(x AS T) AS y ... ORDER BY - CAST(x AS T)`` must match the projection via the canonical key — not - via the alias `y`.""" +def test_cast_aliased_projection_order_by_via_alias_works(dialect) -> None: + """Workaround: with an aliased CAST projection, ORDER BY + resolves directly through item_by_projected_name. The ORDER BY CAST(...) + canonical form was removed (Codex round 2) — agents must reference the + alias instead.""" result = translate( sql=( "SELECT CAST(delivered_at AS TIMESTAMP) AS ts FROM orders " - "ORDER BY CAST(delivered_at AS TIMESTAMP) ASC" + "ORDER BY ts ASC" ), catalog=_catalog(), dialect=dialect, ) @@ -1159,18 +1181,19 @@ def test_cast_aliased_projection_order_by_canonical_resolves(dialect) -> None: assert result.query.order[0].direction == "asc" -def test_cast_aliased_projection_group_by_canonical_resolves(dialect) -> None: - """Same setdefault canonical-form coverage on the GROUP BY path with - an aliased projection.""" - result = translate( - sql=( - "SELECT CAST(delivered_at AS TIMESTAMP) AS ts FROM orders " - "GROUP BY CAST(delivered_at AS TIMESTAMP)" - ), - catalog=_catalog(), dialect=dialect, - ) - assert isinstance(result, QueryResult) - assert dict(result.column_name_mapping) == {"orders.delivered_at": "ts"} +def test_cast_aliased_projection_group_by_unaliased_rejected(dialect) -> None: + """An aliased CAST projection + GROUP BY repeating the CAST shape is + still rejected — the canonical form is not in derived_dims after Codex + round 2's removal. Agents must rewrite to ``GROUP BY `` or the + bare column.""" + with pytest.raises(TranslationError): + translate( + sql=( + "SELECT CAST(delivered_at AS TIMESTAMP) AS ts FROM orders " + "GROUP BY CAST(delivered_at AS TIMESTAMP)" + ), + catalog=_catalog(), dialect=dialect, + ) def test_cast_metric_projection_overrides_wire_type(dialect) -> None: From d05c2d4f7c145237cdda1deafeb2ea9dc01a0e05 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 16:49:31 +0200 Subject: [PATCH 05/13] DEV-1566 PR #185 round 4: per-pair lossy CAST rejection in alias path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 3 follow-up: round 3's alias workaround (SELECT CAST(c AS T) AS x ... ORDER BY x) carries the same semantic gap the canonical-form removal was meant to close — sort/group still runs against the bare engine column. Add two per-pair lossy sets: _LOSSY_ORDER_BY_CAST_PAIRS — every X -> TEXT (lex sort != engine natural) _LOSSY_GROUP_BY_CAST_PAIRS — TIMESTAMP -> DATE (only many-to-one admitted) In _translate_order_by, after the projection-item lookup, raise a specific "ORDER BY on CAST projection ... with lossy pair X->T is unsupported" TranslationError when the pair is in the lossy ORDER BY set. _validate_group_by gains an item_by_projected_name parameter so it can apply the symmetric GROUP BY check after the strict-on-extras membership test. The caller in _translate_slayer_select is reordered so _index_items_by_canonical_form runs before _validate_group_by. Safe pairs (identity, INT->DOUBLE, DATE<->TIMESTAMP) keep the alias path admitted because the engine column's natural sort/group matches the casted semantics. Tests: - tests/facade/test_translator.py: - parametrised X -> TEXT rejection for ORDER BY alias path (5 pairs); - TIMESTAMP -> DATE rejection for GROUP BY alias path; - parametrised safe-pair admission for both ORDER BY and GROUP BY (DATE -> TIMESTAMP, TIMESTAMP -> DATE for ORDER BY only, INT -> DOUBLE, identity). Docs: - docs/interfaces/pg-facade.md: replaced the round-3 "documented limitation" paragraph with the per-path / per-pair admission table. Co-Authored-By: Claude Opus 4.7 --- docs/interfaces/pg-facade.md | 47 +++++++++---- slayer/facade/translator.py | 90 ++++++++++++++++++++++++- tests/facade/test_translator.py | 113 ++++++++++++++++++++++++++++++++ 3 files changed, 233 insertions(+), 17 deletions(-) diff --git a/docs/interfaces/pg-facade.md b/docs/interfaces/pg-facade.md index 1dbec490..e8de6e66 100644 --- a/docs/interfaces/pg-facade.md +++ b/docs/interfaces/pg-facade.md @@ -159,29 +159,48 @@ Out of scope: `CAST` around aggregates (`CAST(SUM(amount) AS DOUBLE)`), `TRY_CAS and `CAST` around expressions that aren't a bare column (`CAST(SUBSTRING(...) AS T)`). `CAST` wrapping a `DATE_TRUNC(...)` continues to route through the time-grain unwrap. -`CAST(...)` in `ORDER BY` and `GROUP BY` is **not** admitted. The engine still -projects the bare column, so a `ORDER BY CAST(id AS TEXT)` would sort numerically -(by the underlying `INT`) instead of lex; a `GROUP BY CAST(ts AS DATE)` would -group per-timestamp instead of per-date. Both are silently wrong, so the -translator raises `ORDER BY column '...' is not in the projection list` / -the GROUP BY strict-on-extras error. Use the alias workaround: +`CAST(...)` in `ORDER BY` and `GROUP BY` has two layers of admission: + +1. **Unaliased canonical-form** (e.g. `ORDER BY CAST(c AS T)` repeating the + projection's CAST verbatim): **never admitted.** The translator raises + `ORDER BY column '...' is not in the projection list` / the GROUP BY + strict-on-extras error. Workaround: alias and reference the alias. +2. **Aliased reference** (`SELECT CAST(c AS T) AS x ... ORDER BY x` / + `... GROUP BY x`): admitted **only** when the `(source, target)` pair + preserves sort/group semantics under the bare-column engine projection. + +Pairs that **fail** the aliased-reference admission and raise +`ORDER BY on CAST projection '...' with lossy pair X→T is unsupported` +(symmetric message for GROUP BY): + +| Path | Lossy pairs | +|----------|-----------------------------------------------------------------| +| ORDER BY | `X → TEXT` for every `X` (lex sort ≠ engine's natural sort) | +| GROUP BY | `TIMESTAMP → DATE` (many-to-one rollup) | + +Every other admitted pair — identity (`X → X`), `DATE → TIMESTAMP`, +`TIMESTAMP → DATE` for ORDER BY, `INT → DOUBLE` — preserves the casted +semantics under the bare-column engine projection, so the alias path stays +open. ```sql --- Rejected: +-- Always rejected (canonical form): SELECT CAST(delivered_at AS TIMESTAMP) FROM orders ORDER BY CAST(delivered_at AS TIMESTAMP); --- Works: +-- Aliased reference, safe pair → works: SELECT CAST(delivered_at AS TIMESTAMP) AS dt FROM orders ORDER BY dt; + +-- Aliased reference, lossy pair → rejected: +SELECT CAST(id AS TEXT) AS s FROM orders ORDER BY s; +SELECT CAST(ordered_at AS DATE) AS d, COUNT(*) FROM orders GROUP BY d; ``` -The wire-type override still applies — `dt` is wire-typed `TIMESTAMP` — but the -sort runs against the engine column's natural type (`DATE` here, which is -order-preserving so the result is correct). For genuinely lossy semantics -(e.g. lex sort by `TEXT`-cast `INT`) the alias path makes the limitation -explicit at the SQL level. A future ticket can lift this restriction by -pushing the CAST into the engine SQL. +The wire-type override still applies in the safe-pair case — `dt` is +wire-typed `TIMESTAMP` even though the engine sorts the underlying `DATE`. +A future ticket can lift the remaining restrictions by pushing the CAST +into the engine SQL. Admitted (source, target) coercions: diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index a1e1ade2..c24bd0db 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -275,6 +275,30 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: DataType.TEXT: frozenset({DataType.TEXT}), } +# DEV-1566 — Codex round 3: pairs whose ORDER BY / GROUP BY semantics under +# Postgres differ from what the bare-column engine projection would produce. +# A query that references a CAST-projected item (via alias or canonical form) +# in ORDER BY / GROUP BY is rejected when the pair is in the corresponding +# set — workaround: order/group by the bare column, or wait for a follow-up +# ticket that pushes CAST into the engine SQL. +# +# ORDER BY lossy: every X→TEXT pair (lex sort ≠ engine's natural sort). +# Identity X→X, INT→DOUBLE, DATE↔TIMESTAMP all preserve relative order so +# they stay admitted. +_LOSSY_ORDER_BY_CAST_PAIRS: frozenset = frozenset({ + (DataType.INT, DataType.TEXT), + (DataType.DOUBLE, DataType.TEXT), + (DataType.BOOLEAN, DataType.TEXT), + (DataType.DATE, DataType.TEXT), + (DataType.TIMESTAMP, DataType.TEXT), +}) +# GROUP BY lossy: only the many-to-one TIMESTAMP→DATE rollup. Every other +# admitted pair is a 1:1 / identity mapping, so per-engine-column grouping +# already collapses to the same set of casted groups. +_LOSSY_GROUP_BY_CAST_PAIRS: frozenset = frozenset({ + (DataType.TIMESTAMP, DataType.DATE), +}) + def _sqlglot_type_to_datatype(node: exp.DataType) -> Optional[DataType]: """Map a sqlglot ``DataType`` node to a SLayer ``DataType``. @@ -1368,6 +1392,34 @@ def _flip_comparator(op_sql: str) -> str: # --- ORDER BY / GROUP BY ----------------------------------------------------- +def _item_cast_source_type(item: _ProjectionItem) -> Optional[DataType]: + """Return the underlying source DataType of a CAST-projected item, or + None when the item isn't a CAST projection. Used to look up the + (source, target) pair against the lossy-CAST sets.""" + if item.cast_target is None: + return None + if item.metric is not None: + return item.metric.data_type + if item.dimension is not None: + return item.dimension.data_type + return None + + +def _lossy_cast_error_message( + *, kind: str, name: str, source: DataType, target: DataType, +) -> str: + """One-line user-facing message for the lossy-CAST-pair rejection. The + ``kind`` is ``ORDER BY`` or ``GROUP BY``; ``name`` is the user-visible + identifier the message points at (typically the alias).""" + return ( + f"{kind} on CAST projection {name!r} with lossy pair " + f"{source!s}→{target!s} is unsupported: the engine query projects " + f"the bare column, so the underlying sort/group semantics don't " + f"match the casted type. Use the bare column, or wait for engine-" + f"side CAST pushdown." + ) + + def _translate_order_by( order: Optional[exp.Order], item_by_projected_name: Dict[str, _ProjectionItem], @@ -1387,6 +1439,16 @@ def _translate_order_by( f"ORDER BY column {name!r} is not in the projection list" ) item = item_by_projected_name[name] + # DEV-1566 — Codex round 3: reject ORDER BY on CAST projections whose + # (source, target) pair is lossy under bare-column sort semantics. + source = _item_cast_source_type(item) + if source is not None and item.cast_target is not None and ( + (source, item.cast_target) in _LOSSY_ORDER_BY_CAST_PAIRS + ): + raise TranslationError(_lossy_cast_error_message( + kind="ORDER BY", name=name, + source=source, target=item.cast_target, + )) if item.metric is not None: ref = ColumnRef(name=item.metric.name) else: @@ -1424,9 +1486,17 @@ def _order_by_name( def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; extraction adds indirection group: Optional[exp.Group], derived: List[str], + item_by_projected_name: Dict[str, _ProjectionItem], *, strip_prefix: Optional[Tuple[str, str]] = None, ) -> None: - """Apply the strict-on-extras / lenient-on-omissions policy (§6.1).""" + """Apply the strict-on-extras / lenient-on-omissions policy (§6.1). + + ``item_by_projected_name`` is consulted only to look up the projection + item behind a user-facing name so the DEV-1566 lossy-CAST-pair rejection + can fire. Membership validation still runs against the ``derived`` list + (which carries time-grain canonical forms in addition to the projected + names) to preserve the existing strict-on-extras behaviour. + """ if group is None: return derived_set = set(derived) @@ -1454,6 +1524,19 @@ def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; e f"GROUP BY item {u!r} is not in the projection's derived " f"dimension set ({sorted(derived_set)})" ) + # DEV-1566 — Codex round 3: reject GROUP BY on CAST projections + # whose (source, target) pair is lossy under bare-column grouping. + item = item_by_projected_name.get(u) + if item is None: + continue + source = _item_cast_source_type(item) + if source is not None and item.cast_target is not None and ( + (source, item.cast_target) in _LOSSY_GROUP_BY_CAST_PAIRS + ): + raise TranslationError(_lossy_cast_error_message( + kind="GROUP BY", name=u, + source=source, target=item.cast_target, + )) def _is_ignorable_group_item(item: str) -> bool: @@ -1752,9 +1835,11 @@ def _translate_slayer_select( allow_column_cast=allow_column_cast, ) plan = _build_projection_plan(items, table) + item_by_projected_name = _index_items_by_canonical_form(items) _validate_group_by( - parsed.args.get("group"), plan.derived_dims, strip_prefix=strip_prefix, + parsed.args.get("group"), plan.derived_dims, item_by_projected_name, + strip_prefix=strip_prefix, ) filters: List[str] = [] @@ -1766,7 +1851,6 @@ def _translate_slayer_select( parsed.args.get("having"), table, filters, strip_prefix=strip_prefix, ) - item_by_projected_name = _index_items_by_canonical_form(items) order_items = _translate_order_by( parsed.args.get("order"), item_by_projected_name, strip_prefix=strip_prefix, ) diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 826b4a8d..c7a822ff 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1196,6 +1196,119 @@ def test_cast_aliased_projection_group_by_unaliased_rejected(dialect) -> None: ) +# --- Codex round 3: lossy-pair rejection via alias --------------------------- +# +# Round 3 dropped CAST canonical-form registration so unaliased +# ORDER BY / GROUP BY CAST(...) fail with "not in projection list". But the +# aliased path (SELECT CAST(c AS T) AS x ... ORDER BY x) still resolves via +# the bare alias; it sorts/groups by the bare column, which for lossy pairs +# (X→TEXT for ORDER BY, TIMESTAMP→DATE for GROUP BY) silently disagrees with +# the casted semantics. These pairs are rejected at order/group-by time; +# every other admitted pair (1:1, identity, INT→DOUBLE, DATE↔TIMESTAMP) +# stays admitted via alias because the underlying engine value preserves +# the casted order/grouping. + + +@pytest.mark.parametrize( + ("col", "target"), + [ + ("id", "TEXT"), # INT → TEXT + ("revenue", "TEXT"), # DOUBLE → TEXT + ("is_paid", "TEXT"), # BOOLEAN → TEXT + ("delivered_at", "TEXT"), # DATE → TEXT + ("ordered_at", "TEXT"), # TIMESTAMP → TEXT + ], +) +def test_cast_alias_order_by_lossy_pair_rejected( + col: str, target: str, dialect, +) -> None: + """SELECT CAST( AS TEXT) AS x ... ORDER BY x is admitted in + projection but rejected at ORDER BY resolution — the engine query still + sorts by the bare column's natural type (numeric/temporal) rather than + the casted type's lex order.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=( + f"SELECT CAST({col} AS {target}) AS x FROM orders ORDER BY x" + ), + catalog=_catalog(), dialect=dialect, + ) + msg = str(exc_info.value) + assert "ORDER BY on CAST projection" in msg + assert "lossy pair" in msg + + +def test_cast_alias_group_by_lossy_timestamp_to_date_rejected(dialect) -> None: + """The only many-to-one admitted pair: SELECT CAST(ordered_at AS DATE) + AS d ... GROUP BY d would group per-timestamp but advertise per-date, + surfacing duplicate dates. Rejected.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=( + "SELECT CAST(ordered_at AS DATE) AS d, COUNT(*) FROM orders " + "GROUP BY d" + ), + catalog=_catalog(), dialect=dialect, + ) + msg = str(exc_info.value) + assert "GROUP BY on CAST projection" in msg + assert "lossy pair" in msg + + +@pytest.mark.parametrize( + ("col", "target"), + [ + ("delivered_at", "TIMESTAMP"), # DATE → TIMESTAMP (1:1) + ("ordered_at", "DATE"), # TIMESTAMP → DATE — only for ORDER BY + ("id", "DOUBLE"), # INT → DOUBLE (1:1 within INT range) + ("delivered_at", "DATE"), # identity DATE → DATE + ("revenue", "DOUBLE"), # identity DOUBLE → DOUBLE + ], +) +def test_cast_alias_order_by_safe_pair_admitted( + col: str, target: str, dialect, +) -> None: + """Order-preserving pairs (1:1 / identity / DATE↔TIMESTAMP) stay + admitted via alias because the engine column's natural sort matches + the casted type's sort. The wire-type override still applies.""" + result = translate( + sql=( + f"SELECT CAST({col} AS {target}) AS x FROM orders ORDER BY x" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].direction == "asc" + assert result.projection_types == [DataType(target)] + + +@pytest.mark.parametrize( + ("col", "target"), + [ + ("delivered_at", "TIMESTAMP"), # DATE → TIMESTAMP (1:1) + ("id", "DOUBLE"), # INT → DOUBLE + ("revenue", "DOUBLE"), # identity + ("delivered_at", "DATE"), # identity + ("status", "TEXT"), # identity TEXT → TEXT + ], +) +def test_cast_alias_group_by_safe_pair_admitted( + col: str, target: str, dialect, +) -> None: + """1:1 / identity pairs preserve the per-engine-column grouping under + the casted type, so GROUP BY stays admitted.""" + result = translate( + sql=( + f"SELECT CAST({col} AS {target}) AS x, COUNT(*) FROM orders " + f"GROUP BY x" + ), + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.projection_types[0] == DataType(target) + + def test_cast_metric_projection_overrides_wire_type(dialect) -> None: """A CAST around a metric reference resolves through the metric path (`_record_metric`), and the cast target wins over the declared metric type.""" From c096f026a363cf12d0db67a32ab254af024b3318 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 17:05:29 +0200 Subject: [PATCH 06/13] DEV-1566 PR #185 round 5: extend lossy-CAST rejection to unknown-source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 4: round-4's lossy-pair check skipped unknown-source CAST projections (custom metrics without a declared data_type). The ``_is_admitted_cast`` rule lets ``None -> TEXT`` through, and the same lex-vs-natural ORDER BY argument applies — silently sorting by the engine's arbitrary value type. Refactor: - Split the source lookup from the lossy-pair check. ``_item_cast_source_type`` now returns ``None`` only when the source is genuinely unknown (caller pre-checks ``item.cast_target`` to disambiguate). - Add ``_cast_pair_is_lossy(source, target, lossy_pairs)`` — when ``source is None`` it treats ``target=TEXT`` as lossy (the only path ``_is_admitted_cast`` lets through), preserving ORDER BY safety even for declared-typeless metrics. - ``_lossy_cast_error_message`` now accepts ``Optional[DataType]`` for ``source`` and renders ``unknown`` when missing. GROUP BY path is unaffected by the unknown-source case: unknown-source casts only project metrics, and metric aliases never enter ``derived_dims``, so the strict-on-extras membership check already rejects them before the lossy-pair branch fires. Tests: - ``test_cast_unknown_source_datatype_admits_text_only`` extended with the ORDER BY rejection assertion and the explanatory note about the metric/GROUP BY interaction. Co-Authored-By: Claude Opus 4.7 --- slayer/facade/translator.py | 63 ++++++++++++++++++++++++--------- tests/facade/test_translator.py | 19 ++++++++++ 2 files changed, 65 insertions(+), 17 deletions(-) diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index c24bd0db..3d083b50 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -1394,10 +1394,11 @@ def _flip_comparator(op_sql: str) -> str: def _item_cast_source_type(item: _ProjectionItem) -> Optional[DataType]: """Return the underlying source DataType of a CAST-projected item, or - None when the item isn't a CAST projection. Used to look up the - (source, target) pair against the lossy-CAST sets.""" - if item.cast_target is None: - return None + None when the source type is unknown (custom metric without a declared + ``data_type``). Caller MUST gate on ``item.cast_target is not None`` + before calling this — the helper's None return is intentionally + overloaded with the unknown-source case so the lossy-pair check can + treat unknown-source casts as lossy by default.""" if item.metric is not None: return item.metric.data_type if item.dimension is not None: @@ -1405,15 +1406,34 @@ def _item_cast_source_type(item: _ProjectionItem) -> Optional[DataType]: return None +def _cast_pair_is_lossy( + *, source: Optional[DataType], target: DataType, + lossy_pairs: frozenset, +) -> bool: + """A cast is lossy when either: (a) the source is known and (source, + target) is in ``lossy_pairs``, or (b) the source is unknown — the only + admitted unknown-source target is TEXT (see ``_is_admitted_cast``) + which is always lossy for ORDER BY (lex vs engine's natural sort). + For GROUP BY the unknown-source path is admitted because the engine's + bare-value grouping preserves whatever per-value distinction TEXT + would expose.""" + if source is None: + return target is DataType.TEXT + return (source, target) in lossy_pairs + + def _lossy_cast_error_message( - *, kind: str, name: str, source: DataType, target: DataType, + *, kind: str, name: str, source: Optional[DataType], target: DataType, ) -> str: """One-line user-facing message for the lossy-CAST-pair rejection. The ``kind`` is ``ORDER BY`` or ``GROUP BY``; ``name`` is the user-visible - identifier the message points at (typically the alias).""" + identifier the message points at (typically the alias). ``source`` is + None when the underlying metric/dimension has no declared data_type + (custom metric path), which is treated as lossy by default.""" + source_str = str(source) if source is not None else "unknown" return ( f"{kind} on CAST projection {name!r} with lossy pair " - f"{source!s}→{target!s} is unsupported: the engine query projects " + f"{source_str}→{target!s} is unsupported: the engine query projects " f"the bare column, so the underlying sort/group semantics don't " f"match the casted type. Use the bare column, or wait for engine-" f"side CAST pushdown." @@ -1441,14 +1461,19 @@ def _translate_order_by( item = item_by_projected_name[name] # DEV-1566 — Codex round 3: reject ORDER BY on CAST projections whose # (source, target) pair is lossy under bare-column sort semantics. - source = _item_cast_source_type(item) - if source is not None and item.cast_target is not None and ( - (source, item.cast_target) in _LOSSY_ORDER_BY_CAST_PAIRS - ): - raise TranslationError(_lossy_cast_error_message( - kind="ORDER BY", name=name, + # Unknown-source casts (custom metrics without a declared data_type) + # are treated as lossy — the only admitted unknown-source target is + # TEXT, and the lex-vs-natural sort gap applies. + if item.cast_target is not None: + source = _item_cast_source_type(item) + if _cast_pair_is_lossy( source=source, target=item.cast_target, - )) + lossy_pairs=_LOSSY_ORDER_BY_CAST_PAIRS, + ): + raise TranslationError(_lossy_cast_error_message( + kind="ORDER BY", name=name, + source=source, target=item.cast_target, + )) if item.metric is not None: ref = ColumnRef(name=item.metric.name) else: @@ -1526,12 +1551,16 @@ def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; e ) # DEV-1566 — Codex round 3: reject GROUP BY on CAST projections # whose (source, target) pair is lossy under bare-column grouping. + # Unknown-source casts (only admitted as None→TEXT) are admitted + # here — the engine's per-value grouping already collapses to the + # same set of TEXT representations. item = item_by_projected_name.get(u) - if item is None: + if item is None or item.cast_target is None: continue source = _item_cast_source_type(item) - if source is not None and item.cast_target is not None and ( - (source, item.cast_target) in _LOSSY_GROUP_BY_CAST_PAIRS + if _cast_pair_is_lossy( + source=source, target=item.cast_target, + lossy_pairs=_LOSSY_GROUP_BY_CAST_PAIRS, ): raise TranslationError(_lossy_cast_error_message( kind="GROUP BY", name=u, diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index c7a822ff..aad8ef00 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1054,6 +1054,25 @@ def test_cast_unknown_source_datatype_admits_text_only(dialect) -> None: ) assert "Unsupported CAST" in str(exc_info.value) + # Codex round 4: ORDER BY on the alias of an unknown-source CAST→TEXT + # must be rejected too — the lex-vs-natural sort gap applies regardless + # of whether the underlying source type was declared. The error message + # surfaces "unknown" for the source. (GROUP BY on a metric alias is + # already rejected by the strict-on-extras membership check — only + # dimensions populate derived_dims — so the lossy GROUP BY path can't + # be triggered through an unknown-source metric.) + with pytest.raises(TranslationError) as exc_info: + translate( + sql=( + "SELECT CAST(custom_metric AS TEXT) AS m FROM orders " + "ORDER BY m" + ), + catalog=catalog, dialect=dialect, + ) + msg = str(exc_info.value) + assert "ORDER BY on CAST projection" in msg + assert "unknown" in msg + @pytest.mark.parametrize( "col", ["delivered_at", "ordered_at", "id", "revenue", "is_paid", "status"], From 38d6b9fc26bd806504e424ed3a88433a182871f9 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 17:19:28 +0200 Subject: [PATCH 07/13] DEV-1566 PR #185 round 6: add INT->DOUBLE to lossy GROUP BY set Codex round 5: int64 has precise IEEE 754 float64 representation only up to +-2^53; larger bigints (e.g. 9007199254740992 vs 9007199254740993) collapse to the same double. Postgres-side `GROUP BY cast(id AS DOUBLE)` would merge them; the facade groups by the bare int and over-reports groups. Same many-to-one shape as TIMESTAMP -> DATE. - Add `(DataType.INT, DataType.DOUBLE)` to `_LOSSY_GROUP_BY_CAST_PAIRS`. - Parametrise the GROUP BY rejection test over both lossy pairs. - Remove `(id, DOUBLE)` from the GROUP BY safe-pair table. - Doc table updated. ORDER BY is unaffected: IEEE 754 preserves the relative order of int64 values even when distinct bigints collapse to the same double, so the ORDER BY result is non-strict but not wrong. Co-Authored-By: Claude Opus 4.7 --- docs/interfaces/pg-facade.md | 8 ++++---- slayer/facade/translator.py | 13 ++++++++++--- tests/facade/test_translator.py | 30 +++++++++++++++++++++++------- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/docs/interfaces/pg-facade.md b/docs/interfaces/pg-facade.md index e8de6e66..0f75363c 100644 --- a/docs/interfaces/pg-facade.md +++ b/docs/interfaces/pg-facade.md @@ -173,10 +173,10 @@ Pairs that **fail** the aliased-reference admission and raise `ORDER BY on CAST projection '...' with lossy pair X→T is unsupported` (symmetric message for GROUP BY): -| Path | Lossy pairs | -|----------|-----------------------------------------------------------------| -| ORDER BY | `X → TEXT` for every `X` (lex sort ≠ engine's natural sort) | -| GROUP BY | `TIMESTAMP → DATE` (many-to-one rollup) | +| Path | Lossy pairs | +|----------|--------------------------------------------------------------------------| +| ORDER BY | `X → TEXT` for every `X` (lex sort ≠ engine's natural sort) | +| GROUP BY | `TIMESTAMP → DATE` (many-to-one rollup); `INT → DOUBLE` (IEEE 754 collapse beyond ±2^53) | Every other admitted pair — identity (`X → X`), `DATE → TIMESTAMP`, `TIMESTAMP → DATE` for ORDER BY, `INT → DOUBLE` — preserves the casted diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index 3d083b50..f2fd33fc 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -292,11 +292,18 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: (DataType.DATE, DataType.TEXT), (DataType.TIMESTAMP, DataType.TEXT), }) -# GROUP BY lossy: only the many-to-one TIMESTAMP→DATE rollup. Every other -# admitted pair is a 1:1 / identity mapping, so per-engine-column grouping -# already collapses to the same set of casted groups. +# GROUP BY lossy: many-to-one casts where the engine-column grouping +# returns MORE groups than the casted column would. +# - TIMESTAMP → DATE: multiple timestamps per date. +# - INT → DOUBLE: int64 has precise range ±2^53 in IEEE 754 float64; +# larger bigints lose precision so distinct ints can collapse to the +# same double under Postgres's GROUP BY semantics, but the facade +# groups by the bare int and over-reports groups. +# Every other admitted pair is a 1:1 / identity mapping within the +# supported value range. _LOSSY_GROUP_BY_CAST_PAIRS: frozenset = frozenset({ (DataType.TIMESTAMP, DataType.DATE), + (DataType.INT, DataType.DOUBLE), }) diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index aad8ef00..96d9e758 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1257,21 +1257,38 @@ def test_cast_alias_order_by_lossy_pair_rejected( assert "lossy pair" in msg -def test_cast_alias_group_by_lossy_timestamp_to_date_rejected(dialect) -> None: - """The only many-to-one admitted pair: SELECT CAST(ordered_at AS DATE) - AS d ... GROUP BY d would group per-timestamp but advertise per-date, - surfacing duplicate dates. Rejected.""" +@pytest.mark.parametrize( + ("col", "target", "expected_source", "expected_target"), + [ + # TIMESTAMP → DATE: many timestamps per date. + ("ordered_at", "DATE", "TIMESTAMP", "DATE"), + # Codex round 5: INT → DOUBLE is also lossy for grouping because + # IEEE 754 float64 cannot represent every int64 distinctly; large + # bigints collapse to the same double under Postgres semantics + # while the facade groups by the bare int and over-reports groups. + ("id", "DOUBLE", "INT", "DOUBLE"), + ], +) +def test_cast_alias_group_by_lossy_pair_rejected( + col: str, target: str, expected_source: str, expected_target: str, + dialect, +) -> None: + """Many-to-one admitted pairs (TIMESTAMP→DATE, INT→DOUBLE) are rejected + in GROUP BY alias paths — the facade's per-engine-column grouping + returns more groups than Postgres semantics would produce.""" with pytest.raises(TranslationError) as exc_info: translate( sql=( - "SELECT CAST(ordered_at AS DATE) AS d, COUNT(*) FROM orders " - "GROUP BY d" + f"SELECT CAST({col} AS {target}) AS d, COUNT(*) FROM orders " + f"GROUP BY d" ), catalog=_catalog(), dialect=dialect, ) msg = str(exc_info.value) assert "GROUP BY on CAST projection" in msg assert "lossy pair" in msg + assert expected_source in msg + assert expected_target in msg @pytest.mark.parametrize( @@ -1306,7 +1323,6 @@ def test_cast_alias_order_by_safe_pair_admitted( ("col", "target"), [ ("delivered_at", "TIMESTAMP"), # DATE → TIMESTAMP (1:1) - ("id", "DOUBLE"), # INT → DOUBLE ("revenue", "DOUBLE"), # identity ("delivered_at", "DATE"), # identity ("status", "TEXT"), # identity TEXT → TEXT From 3aa82b967f2f3900a6044ed1668918a7cbcdc4b7 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 17:39:10 +0200 Subject: [PATCH 08/13] DEV-1566 PR #185 round 7: Sonar cleanup + fix two unrelated test failures Sonar (round 6 carryover): - S3358 on translator.py:954-957: replaced the metric/dim chained-ternary in `_resolve_column_cast_projection` with a call to the existing `_item_cast_source_type(inner)` helper. Drops one NOSONAR comment and unifies the source-type lookup with the helper added in round 4. - S3776 on `_translate_order_by`: extracted the lossy-CAST check into a new `_reject_lossy_cast_or_pass(...)` helper shared with `_validate_group_by`. Drops complexity below the 15-line budget on both functions without changing behaviour. Two unrelated test failures that turned up on this branch: - tests/dialects/test_byte_equivalence.py: the ClickHouse snapshot expected `DATE_TRUNC` but sqlglot now emits the canonical lowercase `dateTrunc` form (ClickHouse accepts both). Updated the snapshot + added a comment about the canonical form. - tests/facade/test_catalog_sql.py: `pg_catalog.current_user` / `pg_catalog.current_catalog` weren't being substituted because sqlglot parses the no-parens form as `Column(this=, table='pg_catalog')`, not as a `Dot`. The existing rewriter only matched the `Dot` shape. Added a sibling `_substitute_qualified_context_column` branch and wired it into `_substitute_context_functions`. The Dot-shaped variants (`pg_catalog.current_database()`, `current_schema()`) continue to route through the existing branch. Co-Authored-By: Claude Opus 4.7 --- slayer/facade/catalog_sql.py | 26 +++++++++++ slayer/facade/translator.py | 58 ++++++++++++------------- tests/dialects/test_byte_equivalence.py | 6 ++- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/slayer/facade/catalog_sql.py b/slayer/facade/catalog_sql.py index ec3de9bb..919a35a3 100644 --- a/slayer/facade/catalog_sql.py +++ b/slayer/facade/catalog_sql.py @@ -1203,6 +1203,7 @@ def _substitute_context_functions(self, node: exp.Expression) -> exp.Expression: # Try each substitution branch in order; first hit wins. substituted = ( self._substitute_qualified_context_call(node) + or self._substitute_qualified_context_column(node) or self._substitute_dedicated_func(node) or self._substitute_bareword_column(node) or self._substitute_anonymous_function(node) @@ -1234,6 +1235,31 @@ def _substitute_qualified_context_call( or self._substitute_anonymous_function(rhs) ) + def _substitute_qualified_context_column( + self, node: exp.Expression, + ) -> Optional[exp.Expression]: + """Replace ``pg_catalog.`` where sqlglot parses the + whole thing as ``Column(this=, table='pg_catalog')`` — the + no-parens shape (``pg_catalog.current_user``, + ``pg_catalog.current_catalog``). The Dot-shaped variant + (``pg_catalog.current_database()``) is handled by + ``_substitute_qualified_context_call``. + """ + if not isinstance(node, exp.Column): + return None + table = node.args.get("table") + if table is None: + return None + table_name = ( + str(table.this) if hasattr(table, "this") else str(table) + ).lower() + if table_name != "pg_catalog": + return None + ident = node.this + if not isinstance(ident, exp.Identifier): + return None + return self._literal_for_context_name(str(ident.this).lower()) + def _substitute_dedicated_func(self, node: exp.Expression) -> Optional[exp.Expression]: """Dedicated sqlglot Func subclasses (typed nodes for niladic ctx fns).""" if isinstance(node, (exp.CurrentDatabase, getattr(exp, "CurrentCatalog", exp.CurrentDatabase))): diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index f2fd33fc..df754eb7 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -951,11 +951,7 @@ def _resolve_column_cast_projection( metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, strip_prefix=strip_prefix, ) - source: Optional[DataType] = ( # NOSONAR(S3358) — metric→dim→None fallback; chained ternary is the idiomatic shape - inner.metric.data_type if inner.metric is not None - else inner.dimension.data_type if inner.dimension is not None - else None - ) + source = _item_cast_source_type(inner) if not _is_admitted_cast(source=source, target=cast_target): offending = exp.Cast(this=body, to=exp.DataType.build(cast_target.value)).sql() raise TranslationError( @@ -1447,6 +1443,26 @@ def _lossy_cast_error_message( ) +def _reject_lossy_cast_or_pass( + *, kind: str, name: str, item: _ProjectionItem, lossy_pairs: frozenset, +) -> None: + """If ``item`` is a CAST projection AND its (source, target) pair is + lossy under bare-column ORDER BY / GROUP BY semantics, raise. Otherwise + no-op. Unknown-source casts (custom metrics without a declared + data_type) are admitted as None→TEXT only; ``_cast_pair_is_lossy`` + treats that case as lossy for ORDER BY by default.""" + if item.cast_target is None: + return + source = _item_cast_source_type(item) + if not _cast_pair_is_lossy( + source=source, target=item.cast_target, lossy_pairs=lossy_pairs, + ): + return + raise TranslationError(_lossy_cast_error_message( + kind=kind, name=name, source=source, target=item.cast_target, + )) + + def _translate_order_by( order: Optional[exp.Order], item_by_projected_name: Dict[str, _ProjectionItem], @@ -1466,21 +1482,10 @@ def _translate_order_by( f"ORDER BY column {name!r} is not in the projection list" ) item = item_by_projected_name[name] - # DEV-1566 — Codex round 3: reject ORDER BY on CAST projections whose - # (source, target) pair is lossy under bare-column sort semantics. - # Unknown-source casts (custom metrics without a declared data_type) - # are treated as lossy — the only admitted unknown-source target is - # TEXT, and the lex-vs-natural sort gap applies. - if item.cast_target is not None: - source = _item_cast_source_type(item) - if _cast_pair_is_lossy( - source=source, target=item.cast_target, - lossy_pairs=_LOSSY_ORDER_BY_CAST_PAIRS, - ): - raise TranslationError(_lossy_cast_error_message( - kind="ORDER BY", name=name, - source=source, target=item.cast_target, - )) + _reject_lossy_cast_or_pass( + kind="ORDER BY", name=name, item=item, + lossy_pairs=_LOSSY_ORDER_BY_CAST_PAIRS, + ) if item.metric is not None: ref = ColumnRef(name=item.metric.name) else: @@ -1562,17 +1567,12 @@ def _validate_group_by( # NOSONAR(S3776) — single GROUP BY validation pass; e # here — the engine's per-value grouping already collapses to the # same set of TEXT representations. item = item_by_projected_name.get(u) - if item is None or item.cast_target is None: + if item is None: continue - source = _item_cast_source_type(item) - if _cast_pair_is_lossy( - source=source, target=item.cast_target, + _reject_lossy_cast_or_pass( + kind="GROUP BY", name=u, item=item, lossy_pairs=_LOSSY_GROUP_BY_CAST_PAIRS, - ): - raise TranslationError(_lossy_cast_error_message( - kind="GROUP BY", name=u, - source=source, target=item.cast_target, - )) + ) def _is_ignorable_group_item(item: str) -> bool: diff --git a/tests/dialects/test_byte_equivalence.py b/tests/dialects/test_byte_equivalence.py index b55b23c3..d4324130 100644 --- a/tests/dialects/test_byte_equivalence.py +++ b/tests/dialects/test_byte_equivalence.py @@ -166,13 +166,15 @@ async def test_byte_equivalence_basic_query( ' DATE_TRUNC(\'MONTH\', orders.created_at)', ), ( + # ClickHouse accepts both ``dateTrunc`` and ``DATE_TRUNC``; sqlglot + # now emits the canonical lowercase ClickHouse form. "clickhouse", 'SELECT\n' - ' DATE_TRUNC(\'MONTH\', orders.created_at) AS "orders.created_at",\n' + ' dateTrunc(\'MONTH\', orders.created_at) AS "orders.created_at",\n' ' SUM(orders.amount) AS "orders.revenue_sum"\n' 'FROM public.orders AS orders\n' 'GROUP BY\n' - ' DATE_TRUNC(\'MONTH\', orders.created_at)', + ' dateTrunc(\'MONTH\', orders.created_at)', ), ( "mysql", From 7835920359027be49c52855edfcc0fb934af280c Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 18:05:37 +0200 Subject: [PATCH 09/13] docs/database-support.md: cover all Tier 1 dbs + extend aggregation table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Tier 1 table reshaped into (engine, live test, Docker example) columns. Every Tier 1 engine that gained a testcontainers integration suite on main (ClickHouse, MySQL, SQL Server) is now listed under "live test" rather than "Docker example only". - BigQuery added to Tier 1 with the caveat that its CI coverage is the example's verify.py (no pytest-style integration suite yet) — clarified the trade-off explicitly so readers know it's a shallower tier than the testcontainers suites. - Aggregation support table extended with SQL Server (T-SQL) and BigQuery rows. T-SQL: median/percentile raise; stddev_*/var_* native via STDEV/STDEVP/VAR/VARP renames; corr/covar_* via variance decomposition. BigQuery: median/percentile fail at runtime (PERCENTILE_CONT is analytic-only); stddev_*/var_*/corr/covar_* native. - New SQL Server (T-SQL) caveats subsection covering the dialect's STDEV/STDEVP renames, variance-decomposition for corr/covar_*, median/percentile NotImplementedError, DATETRUNC/DATEADD, bracketed-alias mangling. - New BigQuery caveats subsection covering FK introspection limits, dotted-alias mangling, the median/percentile gap with APPROX_QUANTILES workaround, and the explain=True no-op behaviour. Co-Authored-By: Claude Opus 4.7 --- docs/database-support.md | 103 ++++++++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 12 deletions(-) diff --git a/docs/database-support.md b/docs/database-support.md index af2da54d..3414695e 100644 --- a/docs/database-support.md +++ b/docs/database-support.md @@ -5,23 +5,35 @@ SQL generation. Databases are supported at two tiers. ## Tier 1 — fully tested -Integration tests and/or Docker examples; must not regress. - -| Engine | Coverage | -|---|---| -| **SQLite** | Integration tests in `tests/integration/test_integration.py`; embedded example. | -| **Postgres** | Integration tests in `tests/integration/test_integration_postgres.py`; Docker example. | -| **DuckDB** | Integration tests in `tests/integration/test_integration_duckdb.py` (in-process, no Docker). | -| **MySQL** | Docker example with `verify.py`. | -| **ClickHouse** | Docker example with `verify.py`. | -| **SQL Server** | Docker example with `verify.py` in `examples/sqlserver/`. | -| **Snowflake** | Integration tests in `tests/integration/test_integration_snowflake.py` (skip without `~/.snowflake/connections.toml`); `examples/snowflake/` ships `README.md` + `verify.py`. No Docker (no free local image). | +Live-instance integration tests must not regress. Where Docker images exist, +the suites spin up the engine via `testcontainers`; the cloud-only engines +(BigQuery, Snowflake) skip cleanly when credentials aren't available and run +against the live service in CI when they are. + +| Engine | Live test | Docker example | +|---|---|---| +| **SQLite** | `tests/integration/test_integration.py` (in-process) | `examples/embedded/` | +| **Postgres** | `tests/integration/test_integration_postgres.py` (pytest-postgresql, spawned temp instance) | `examples/postgres/` | +| **DuckDB** | `tests/integration/test_integration_duckdb.py` (in-process) | `examples/embedded/` (DuckDB mode) | +| **MySQL** | `tests/integration/test_integration_mysql.py` (`testcontainers[mysql]`) | `examples/mysql/` | +| **ClickHouse** | `tests/integration/test_integration_clickhouse.py` (`testcontainers[clickhouse]`) | `examples/clickhouse/` | +| **SQL Server** | `tests/integration/test_integration_sqlserver.py` (`testcontainers`, `msodbcsql18` + `unixodbc-dev` on the runner) | `examples/sqlserver/` | +| **Snowflake** | `tests/integration/test_integration_snowflake.py` (skips without `~/.snowflake/connections.toml`; profile name overridable via `$SLAYER_SNOWFLAKE_CONNECTION`) | `examples/snowflake/` (no Docker) | +| **BigQuery** | `examples/bigquery/verify.py` driven by CI against `bigquery-public-data.thelook_ecommerce` (gated on `GCP_PROJECT_ID` / `GCP_SA_KEY_B64` repo secrets) | `examples/bigquery/` (no Docker — managed service) | + +BigQuery does not yet have a pytest-style integration suite; its CI coverage +runs the example's `verify.py` directly via `.github/workflows/ci.yml`. That +exercises auto-ingestion, basic projection, joins, time-grain dimensions, and +the cardinality / sum-of-grouped-equals-total invariants — enough to catch +emitted-SQL regressions, but the verify-script tier is shallower than the +testcontainers suites. ## Tier 2 — code-covered Unit tests for SQL generation; no live-instance verification. -BigQuery, Redshift, Trino/Presto, Databricks/Spark, Oracle. +Redshift, Trino/Presto (Athena uses the Presto dialect), Databricks/Spark, +Oracle. ## Aggregation support @@ -40,6 +52,8 @@ because no standard syntax works everywhere: | ClickHouse | yes | yes | yes | yes | Native `median(x)`, parametric `quantile(p)(x)`, native `stddev_*`/`var_*`/`corr`/`covar*` (camelCase variants emitted by sqlglot for `var_samp`). | | Snowflake | yes | yes | yes | yes | Native `MEDIAN`, `PERCENTILE_CONT(p) WITHIN GROUP`, `STDDEV_*`/`VAR_*`/`CORR`/`COVAR_*`. `LOG10` native; no native `LOG2` (falls through to `LOG(2, x)`). | | MySQL | **no** | **no** | yes | **no** | No native `MEDIAN`/`PERCENTILE_CONT`/`CORR`/`COVAR_*` and no Python-UDF mechanism — SLayer raises `NotImplementedError` for those. `STDDEV_SAMP`/`STDDEV_POP`/`VAR_SAMP`/`VAR_POP` are native on MySQL. Use MariaDB or compute the unsupported aggregations client-side. | +| SQL Server (T-SQL) | **no** | **no** | yes | yes (decomposed) | `MEDIAN` doesn't exist and T-SQL's `PERCENTILE_CONT` is window-only (no `WITHIN GROUP` aggregate form) — SLayer raises `NotImplementedError`. Native `STDEV`/`STDEVP`/`VAR`/`VARP` (slayer renames the canonical `STDDEV_*`/`VAR_*` names at emit time). `CORR`/`COVAR_*` use the same variance-decomposition formula as MySQL (`cov(x,y) = (var(x+y) − var(x) − var(y)) / 2`, `corr = cov / (stddev(x) · stddev(y))`). | +| BigQuery | **no** | **no** | yes | yes | BigQuery has no `MEDIAN` aggregate, and its `PERCENTILE_CONT` is analytic-only (no `WITHIN GROUP` syntax) — the base class emit `PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY x)` fails at runtime. If you need percentile on BigQuery, define a custom `Aggregation` using `APPROX_QUANTILES(x, 100)[OFFSET(N)]`. Native `STDDEV_SAMP`/`STDDEV_POP`/`VAR_SAMP`/`VAR_POP`/`CORR`/`COVAR_SAMP`/`COVAR_POP` (sqlglot may emit `VARIANCE` for `var_samp`). | ### SQLite caveats @@ -105,6 +119,37 @@ If you need percentiles on MySQL, the recommended options are: - Define a custom `Aggregation` on the model with whatever `GROUP_CONCAT`- based or windowed expression suits your data shape and group sizes. +### SQL Server (T-SQL) caveats + +T-SQL has `STDEV`/`STDEVP`/`VAR`/`VARP` (not `STDDEV_SAMP`/`STDDEV_POP`/ +`VAR_SAMP`/`VAR_POP`); sqlglot's tsql transpiler emits incorrect names like +`VAR_SAMP` and `VARIANCE_POP`, so the T-SQL dialect overrides the canonical +spellings via `Anonymous` rewrites in `slayer/sql/dialects/tsql.py`. + +`CORR`/`COVAR_SAMP`/`COVAR_POP` are derived from variance: +`cov(x, y) = (var(x + y) − var(x) − var(y)) / 2`, +`corr = cov / (stddev(x) · stddev(y))`. The decomposition is shared with +MySQL via `_build_covar_decomposition` in `slayer/sql/dialects/base.py`. + +`MEDIAN` doesn't exist, and `PERCENTILE_CONT` in T-SQL is a window function +only — there is no `WITHIN GROUP` aggregate form. SLayer raises +`NotImplementedError` for both at SQL generation time. Use the windowed form +as a custom `Aggregation` if you need it, or compute client-side. + +Other T-SQL specifics surfaced by the dialect: + +- `DATETRUNC(unit, col)` for time-grain dimensions (SQL Server 2022+ — + earlier versions don't have `DATETRUNC` and aren't supported). +- `DATETRUNC(iso_week, col)` for Monday-aligned week truncation — + `@@DATEFIRST`-independent so the bucketing is deterministic. +- `DATEADD(unit, n, col)` for time-shift arithmetic — T-SQL has no + `INTERVAL` literal. +- Bracketed `[ident]` quoting — `.` SLayer aliases get + mangled to `___` at emit and decoded back on result-row + keys (mirror of the BigQuery `___` mangling; see DEV-1571). +- Native `LOG10`, no native `LOG2` (`log2(x)` falls through to the + canonical 2-arg `LOG(2, x)` form). + ### Snowflake caveats Snowflake is a fully managed cloud warehouse — no Docker, no local instance. @@ -130,6 +175,40 @@ Snowflake](configuration/datasources.md#snowflake) for connection setup. canonical 2-arg `LOG(2, x)` form. `LOG10` and the rest of the math / statistical functions are native. +### BigQuery caveats + +BigQuery is a fully managed cloud warehouse — no Docker, no local instance. +CI runs the example's `verify.py` against `bigquery-public-data.thelook_ecommerce`, +gated on `GCP_PROJECT_ID` and `GCP_SA_KEY_B64` repo secrets (forks without +them skip cleanly). Auth via Google Application Default Credentials +(`$GOOGLE_APPLICATION_CREDENTIALS` pointing at a service-account JSON key, +plus `$GCP_PROJECT_ID` for billing). The `bigquery://` driver requires the +`sqlalchemy-bigquery` extra. + +- **No FK introspection.** BigQuery exposes no foreign-key metadata via + `INFORMATION_SCHEMA`, so auto-ingestion cannot discover joins. Hand-declare + `ModelJoin`s on the model. +- **Dotted alias mangling.** BigQuery rejects column names containing `.` + (output schema names must match `[A-Za-z_][A-Za-z0-9_]*`), so SLayer + rewrites `.` aliases (`orders._count`, + `orders.products.category`) to `___` at emit time and + reverses the mapping on result rows. The triple-underscore separator is + distinct from `__` (used by `_query_as_model` for cross-model leaf + flattening), so the two encodings never collide. In `Column.sql`, + fully-qualified table paths must be backticked per-segment + (`` `project`.`dataset`.`table` ``) — a single backticked dotted path of + word-only segments (`` `my_dataset.my_table` ``) would false-positive + mangle. +- **No `MEDIAN` aggregate; `PERCENTILE_CONT` is analytic-only.** Both + raise at SQL generation time (sqlglot doesn't transpile the base class's + `PERCENTILE_CONT(p) WITHIN GROUP (ORDER BY x)` to BigQuery's analytic + form). Use a custom `Aggregation` with `APPROX_QUANTILES(x, 100)[OFFSET(N)]` + when you need it. +- **No native EXPLAIN.** BigQuery has no SQL-level `EXPLAIN`. The + `BigqueryDialect.explain_prefix` is `None`, so `engine.execute(..., + explain=True)` returns the dry-run SQL unchanged rather than an execution + plan. + ## Adding a new dialect 1. Add the mapping to `slayer/engine/query_engine.py:_dialect_for_type()`. From a3ece6ab47cc8264149721566dc12bb8de375628 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 18:19:56 +0200 Subject: [PATCH 10/13] DEV-1566 PR #185 round 8: thread alias_map through CAST + drop bare-col secondary key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 7 follow-up — two correctness gaps in the merged code. Finding 1 (major): `_resolve_column_cast_projection` did not forward `alias_map` to `_resolve_column_projection`, so Metabase-style joined CAST refs (`CAST("Stores"."name" AS TEXT)`) failed with "Unknown projection item" even though the non-CAST shape worked. Thread `alias_map` through the helper signature and the caller in `_resolve_projection`. Finding 2 (minor): `_build_item_index` registered every aliased dimension's `dimension_ref` (the bare dotted column name) as a secondary key. For `SELECT CAST(id AS TEXT) AS x ... ORDER BY id`, that routed the ORDER BY to the CAST item and the lossy-pair check rejected it as `INT -> TEXT` — even though the user explicitly named the bare integer column, where the engine ordering would already match. Gate the secondary key on `item.cast_target is None` so CAST items only resolve via their user alias. Tests: - tests/facade/test_translator.py: - `test_cast_joined_column_metabase_alias_resolves`: pins the Metabase-style joined CAST projection through `_metabase_join_sql` + `_join_catalog`. - `test_cast_order_by_bare_column_does_not_shadow_cast_projection`: bare column + CAST alias both projected; ORDER BY bare column routes to the bare projection. - `test_cast_order_by_bare_column_without_bare_projection_fails_cleanly`: ORDER BY a bare column NOT in the projection surfaces the existing strict-on-extras error, NOT the misleading lossy-CAST error. Co-Authored-By: Claude Opus 4.7 --- slayer/facade/translator.py | 21 ++++++++++++-- tests/facade/test_translator.py | 51 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index c8004840..ca150150 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -1077,15 +1077,21 @@ def _resolve_column_cast_projection( metrics_by_name: Dict[str, FacadeMetric], dims_by_name: Dict[str, FacadeDimension], strip_prefix: Optional[Tuple[str, str]] = None, + alias_map: Optional[Dict[str, str]] = None, ) -> _ProjectionItem: """Resolve ``CAST( AS )`` to a projection item that runs the bare column through the engine and overrides the wire OID via ``cast_target``. Enforces the strict per-pair coercion allowlist. + + DEV-1565: ``alias_map`` carries the LEFT-JOIN subquery's + ``"Stores"."name"`` → catalog ``stores.name`` rewrite so Metabase-style + joined CAST refs resolve through the same dimension-lookup path as + non-casted joined refs. """ inner = _resolve_column_projection( body=body, alias_name=alias_name, table=table, metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, - strip_prefix=strip_prefix, + strip_prefix=strip_prefix, alias_map=alias_map, ) source = _item_cast_source_type(inner) if not _is_admitted_cast(source=source, target=cast_target): @@ -1282,7 +1288,7 @@ def _resolve_projection( # NOSONAR(S3776) — flat dispatch over projection-exp body=inner_col, cast_target=cast_target, alias_name=alias_name, table=table, metrics_by_name=metrics_by_name, dims_by_name=dims_by_name, - strip_prefix=strip_prefix, + strip_prefix=strip_prefix, alias_map=alias_map, )) continue @@ -2789,7 +2795,15 @@ def _build_item_index(items: List[_ProjectionItem]) -> Dict[str, _ProjectionItem secondary key (time-grain canonical form for `CAST(date_trunc(...))` GROUP BY matches; dimension_ref dotted form for joined-col aliased projections — see DEV-1565). ``setdefault`` preserves the primary - projected_name entry when the secondary key collides with it.""" + projected_name entry when the secondary key collides with it. + + DEV-1566: CAST-projected items are NOT registered under their + ``dimension_ref`` secondary key — that would route ``ORDER BY `` + on a query like ``SELECT CAST(id AS TEXT) AS x ... ORDER BY id`` to the + cast item and trip the lossy-pair rejection, even though the user + explicitly named the bare column (and the engine ordering already + matches that intent). + """ out: Dict[str, _ProjectionItem] = {item.projected_name: item for item in items} for item in items: if item.time_grain is not None and item.time_grain_underlying is not None: @@ -2799,6 +2813,7 @@ def _build_item_index(items: List[_ProjectionItem]) -> Dict[str, _ProjectionItem out.setdefault(canonical, item) elif ( item.dimension is not None + and item.cast_target is None and item.dimension.dimension_ref != item.projected_name ): out.setdefault(item.dimension.dimension_ref, item) diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 1f3886fc..a6c21643 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1349,6 +1349,57 @@ def test_cast_alias_group_by_safe_pair_admitted( assert result.projection_types[0] == DataType(target) +def test_cast_order_by_bare_column_does_not_shadow_cast_projection(dialect) -> None: + """Codex round 8: when both the bare column and a CAST alias are + projected, ``ORDER BY `` must route to the bare column + projection — NOT to the CAST item via the secondary-key fallback that + would then trip the lossy-pair rejection. Validates the + ``item.cast_target is None`` guard in ``_build_item_index``.""" + result = translate( + sql="SELECT id, CAST(id AS TEXT) AS x FROM orders ORDER BY id", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].column.name == "id" + assert result.query.order[0].direction == "asc" + # Wire schema: bare id keeps INT; the CAST alias surfaces as TEXT. + assert result.projection_types == [DataType.INT, DataType.TEXT] + + +def test_cast_order_by_bare_column_without_bare_projection_fails_cleanly(dialect) -> None: + """Codex round 8: when the bare column is NOT projected, ``ORDER BY + `` over a CAST projection must surface ``not in the + projection list`` (the existing strict-on-extras error) — NOT the + lossy-CAST rejection (which would falsely imply the user asked for + lex-sort semantics on the casted column).""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql="SELECT CAST(id AS TEXT) AS x, status FROM orders ORDER BY id", + catalog=_catalog(), dialect=dialect, + ) + msg = str(exc_info.value) + assert "not in the projection list" in msg + assert "lossy pair" not in msg + + +def test_cast_joined_column_metabase_alias_resolves(dialect) -> None: + """Codex round 8: ``CAST("Stores"."name" AS TEXT)`` in a Metabase-style + LEFT JOIN subquery must resolve through the alias_map (\"Stores\" → + \"stores\"). Before the fix the CAST branch dropped the alias_map and + the joined ref came through as ``Stores.name`` → ``Unknown projection + item``. Now the cast branch forwards alias_map and the projection + resolves to ``stores.name`` like the non-casted shape does.""" + sql = _metabase_join_sql( + projection='CAST("Stores"."name" AS TEXT) AS "store_name"', + ) + result = translate(sql=sql, catalog=_join_catalog(), dialect=dialect) + assert isinstance(result, QueryResult) + assert result.query.dimensions is not None + assert [d.full_name for d in result.query.dimensions] == ["stores.name"] + assert result.projection_types == [DataType.TEXT] + + def test_cast_metric_projection_overrides_wire_type(dialect) -> None: """A CAST around a metric reference resolves through the metric path (`_record_metric`), and the cast target wins over the declared metric type.""" From 90d2c3f8c56b16330bd2f74b5d2640add838261c Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 18:35:30 +0200 Subject: [PATCH 11/13] DEV-1566 PR #185 round 9: restore 33 tests dropped during round-8 merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 8 flagged that the round-7→8 merge silently dropped 33 tests from origin/main when I reconstructed tests/facade/test_translator.py from HEAD + appended main's join section. Restored sections: - DEV-1569 SET / RESET classification on NoOpResult (14 tests: bare/dotted/quoted/cast-wrapped/signed/comma-value/tab-whitespace SET forms, multi-item SET, RESET, BEGIN/COMMIT/ROLLBACK no-capture). - Probe-matcher behaviour (3 tests: outcome with settings mutation, bare RowBatch return, settings-mutation defaults). - SetSettingOp / ResetSettingOp Pydantic-model assertions. - DEV-1568 MBQL aggregation-alias resolution in WHERE/HAVING/ORDER BY (16 tests: count-star/sum/count-distinct alias refs, gte / literal-on- left flips, case-sensitive matching, non-literal rejection, qualified forms, dimension-alias non-routing). - test_sunday_week_wrapper_two_day_offset_rejected. Imports re-added that an earlier ruff --fix had stripped: SetSettingOp, ResetSettingOp, ProbeMatcherOutcome, RowBatch, FacadeColumn. The section insertions duplicated 7 tests that already lived in the file from HEAD — all 7 pairs were byte-identical (verified via difflib), so the second occurrence was removed and one resulting orphan @pytest.mark.parametrize decorator cleaned up. Zero tests are now missing from EITHER side of the merge (verified via comm -23 against grep -E '^def test_' for both /tmp/test_MAIN.py and the pre-merge HEAD). Co-Authored-By: Claude Opus 4.7 --- tests/facade/test_translator.py | 589 ++++++++++++++++++++++++++++++++ 1 file changed, 589 insertions(+) diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index a6c21643..2545cae6 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -16,13 +16,17 @@ from slayer.core.models import Column, ModelJoin, ModelMeasure, SlayerModel from slayer.core.query import ModelExtension from slayer.facade.catalog import FacadeCatalog, build_catalog +from slayer.facade.rows import FacadeColumn, RowBatch from slayer.facade.translator import ( AGG_OVER_MEASURE_MESSAGE, InfoSchemaResult, NoOpResult, + ProbeMatcherOutcome, ProbeResult, QueryResult, READ_ONLY_MESSAGE, + ResetSettingOp, + SetSettingOp, TranslationError, translate, ) @@ -172,6 +176,350 @@ def test_parse_error_translates(dialect) -> None: assert "parse error" in str(exc_info.value).lower() +# --- DEV-1569: SET / RESET capture on NoOpResult, set_config mutation tunneling --- + + +@pytest.mark.parametrize( + ("sql", "expected_name", "expected_value"), + [ + ("SET application_name = 'foo'", "application_name", "foo"), + ("SET application_name TO 'foo'", "application_name", "foo"), + # Postgres clients commonly emit unquoted RHS (`SET client_encoding TO UTF8`); + # sqlglot parses that as a Var, not a Literal. Both must round-trip. + ("SET client_encoding TO UTF8", "client_encoding", "UTF8"), + # SESSION qualifier still resolves the same name/value pair. + ("SET SESSION application_name = 'foo'", "application_name", "foo"), + # LOCAL qualifier is captured but treated as session-scope (no + # transaction-bound restore, per spec). + ("SET LOCAL application_name = 'foo'", "application_name", "foo"), + # Case-insensitive name (lowercased on capture). + ("SET Application_Name = 'foo'", "application_name", "foo"), + # DEFAULT keyword is a Var literally named "DEFAULT" — captured as the + # string "DEFAULT" (no special reset semantics; users wanting reset use + # RESET). + ("SET application_name = DEFAULT", "application_name", "DEFAULT"), + # Empty-string value — Postgres accepts; we accept too. + ("SET application_name = ''", "application_name", ""), + ], +) +def test_classify_set_populates_set_setting( + sql: str, expected_name: str, expected_value: str, dialect, +) -> None: + result = translate(sql=sql, catalog=_catalog(), dialect=dialect) + assert isinstance(result, NoOpResult) + assert result.command_tag == "SET" + assert result.set_setting == SetSettingOp(name=expected_name, value=expected_value) + assert result.reset_setting is None + + +def test_classify_multi_item_set_does_not_capture(dialect) -> None: + """`SET a = 1, b = 2` (multi-item SetItem list) is not a recognized shape; + set_setting=None, command_tag='SET'. Forces the classifier's + 'single SetItem' restriction.""" + result = translate( + sql="SET application_name = 'x', search_path = 'y'", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, NoOpResult) + assert result.command_tag == "SET" + # Multi-item SET is not captured — too uncertain which mutation to apply. + # The connection silently no-ops it (the same outcome as Command-form SET). + assert result.set_setting is None + assert result.reset_setting is None + + +def test_classify_command_form_set_does_not_capture(dialect) -> None: + """sqlglot falls back to ``exp.Command`` for `SET TIME ZONE 'UTC'` and + `SET SESSION CHARACTERISTICS …`. Per spec, those acknowledge silently + with no setting capture (multi-word setting names, not a clean + ` (=|TO) ` pair).""" + for sql in [ + "SET TIME ZONE 'UTC'", + "SET SESSION CHARACTERISTICS AS TRANSACTION ISOLATION LEVEL READ UNCOMMITTED", + ]: + result = translate(sql=sql, catalog=_catalog(), dialect=dialect) + assert isinstance(result, NoOpResult) + assert result.command_tag == "SET" + assert result.set_setting is None + assert result.reset_setting is None + + +@pytest.mark.parametrize( + ("sql", "expected_name", "expected_value"), + [ + # The DEV-1569 / Codex fix: comma-separated values fall back to + # ``exp.Command`` but still carry a clean ` = ` shape. + # pgjdbc / Metabase emit this for `search_path` on every connection. + ("SET search_path = public, extensions", "search_path", "public, extensions"), + ("SET search_path TO public, extensions", "search_path", "public, extensions"), + ("SET search_path TO 'public', 'extensions'", "search_path", "'public', 'extensions'"), + # Mixed-case name lowercased. + ("SET Search_Path = public, extensions", "search_path", "public, extensions"), + ], +) +def test_classify_command_form_set_with_comma_values_captures( + sql: str, expected_name: str, expected_value: str, +) -> None: + """`SET search_path = a, b` falls back to sqlglot's Command form (the + comma-list parser doesn't recognise multi-value SET yet). The classifier + must still capture (name, raw-value-text) so the connection persists it. + """ + # Run under the postgres dialect; the dialect-less parser yields a + # different shape for this Command form. + result = translate(sql=sql, catalog=_catalog(), dialect="postgres") + assert isinstance(result, NoOpResult) + assert result.command_tag == "SET" + assert result.set_setting == SetSettingOp(name=expected_name, value=expected_value) + assert result.reset_setting is None + + +@pytest.mark.parametrize( + ("sql", "expected_name", "expected_value"), + [ + # Dotted "custom GUC" names — apps and PG extensions use `myapp.user_id`. + # sqlglot parses these as Column(table=my, name=custom); we reconstruct + # the dotted name so SHOW round-trips. + ("SET myapp.user_id = '42'", "myapp.user_id", "42"), + ("SET myapp.User_Id = '42'", "myapp.user_id", "42"), # lowercased + # 3-part dotted name — `Column.parts` walks the full chain. + # Codex round 4 F2. + ("SET my.app.user_id = '42'", "my.app.user_id", "42"), + ], +) +def test_classify_set_dotted_custom_name_captures( + sql: str, expected_name: str, expected_value: str, dialect, +) -> None: + """`SET myapp.user_id = '42'` parses as a multi-part Column; preserve the + dotted form so SHOW myapp.user_id round-trips.""" + result = translate(sql=sql, catalog=_catalog(), dialect=dialect) + assert isinstance(result, NoOpResult) + assert result.set_setting == SetSettingOp( + name=expected_name, value=expected_value, + ) + + +def test_classify_set_cast_wrapped_rhs_captures(dialect) -> None: + """`SET application_name = 'foo'::text` — after extended-protocol bind + substitution of `$1::text`, the rhs is wrapped in exp.Cast. Peer through + one Cast level the same way set_config does. Codex round 4 F3.""" + result = translate( + sql="SET application_name = 'foo'::text", + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, NoOpResult) + assert result.set_setting == SetSettingOp( + name="application_name", value="foo", + ) + + +@pytest.mark.parametrize( + ("sql", "expected_name", "expected_value"), + [ + ("SET extra_float_digits = -1", "extra_float_digits", "-1"), + ("SET seq_page_cost = -0.5", "seq_page_cost", "-0.5"), + ("SET x = +5", "x", "5"), + ], +) +def test_classify_set_signed_numeric_value_captures( + sql: str, expected_name: str, expected_value: str, dialect, +) -> None: + """Signed numeric SET values (`-1`, `-0.5`) parse as `exp.Neg(Literal)`; + `_extract_setting_value` must recognise the prefix. Codex round 5 F2.""" + result = translate(sql=sql, catalog=_catalog(), dialect=dialect) + assert isinstance(result, NoOpResult) + assert result.set_setting == SetSettingOp( + name=expected_name, value=expected_value, + ) + + +def test_classify_command_form_set_dotted_name_captures() -> None: + """`SET myapp.user_id = public, extensions` falls into Command-form + (comma-list value), but the name regex must allow dotted names so the + classifier is consistent with the exp.Set path's lhs.parts + reconstruction. Codex round 5 F3.""" + result = translate( + sql="SET myapp.user_id = public, extensions", + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, NoOpResult) + assert result.set_setting == SetSettingOp( + name="myapp.user_id", value="public, extensions", + ) + + +def test_classify_command_form_set_preserves_quoted_internal_whitespace() -> None: + """`SET x = "foo bar"` — internal whitespace inside the captured value + must survive the separator-detection whitespace normalisation. Codex + round 4 F1.""" + result = translate( + sql='SET search_path = "foo bar", public', + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, NoOpResult) + # The value should preserve the triple-space inside the quoted token. + assert result.set_setting is not None + assert result.set_setting.name == "search_path" + assert "foo bar" in result.set_setting.value + + +def test_classify_command_form_set_with_tab_whitespace_captures() -> None: + """Tab-separated `SET search_path\\tTO\\tpublic` (and other non-space + SQL whitespace around TO) must still capture. Round-2 regex caught + these via \\s+; the round-3 string-ops rewrite must too. + Codex round 3 minor.""" + result = translate( + sql="SET search_path\tTO\tpublic, extensions", + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, NoOpResult) + assert result.set_setting == SetSettingOp( + name="search_path", value="public, extensions", + ) + + +def test_classify_command_form_set_to_keyword_captures(dialect) -> None: + """`SET TO ` (TO instead of =) — same Command-form + extraction.""" + result = translate( + sql="SET search_path TO public, extensions", + catalog=_catalog(), dialect="postgres", + ) + assert isinstance(result, NoOpResult) + assert result.set_setting == SetSettingOp( + name="search_path", value="public, extensions", + ) + + +@pytest.mark.parametrize( + ("sql", "expected_name", "expected_reset_all"), + [ + ("RESET application_name", "application_name", False), + ("RESET Application_Name", "application_name", False), # lowercased + ("RESET ALL", None, True), + ("RESET all", None, True), # case-insensitive ALL keyword + ], +) +def test_classify_reset_populates_reset_setting( + sql: str, expected_name, expected_reset_all: bool, +) -> None: + """RESET is a Postgres-ism — only parses to exp.Command under the + `postgres` dialect; the dialect-less parser treats `RESET ` as + an Alias node. We restrict the test to the dialect that actually + sees RESET traffic.""" + result = translate(sql=sql, catalog=_catalog(), dialect="postgres") + assert isinstance(result, NoOpResult) + assert result.command_tag == "RESET" + assert result.set_setting is None + assert result.reset_setting == ResetSettingOp( + name=expected_name, reset_all=expected_reset_all, + ) + + +def test_classify_bare_reset_has_no_setting_capture() -> None: + """`RESET` with no argument acknowledges silently with no setting capture + (defensive — drivers don't emit this in practice). PG-only.""" + result = translate(sql="RESET", catalog=_catalog(), dialect="postgres") + assert isinstance(result, NoOpResult) + assert result.command_tag == "RESET" + assert result.set_setting is None + assert result.reset_setting is None + + +def test_classify_begin_commit_rollback_have_no_setting_capture(dialect) -> None: + """Transaction control commands populate command_tag but not set/reset + setting fields.""" + for sql, tag in [ + ("BEGIN", "BEGIN"), + ("COMMIT", "COMMIT"), + ("ROLLBACK", "ROLLBACK"), + ]: + result = translate(sql=sql, catalog=_catalog(), dialect=dialect) + assert isinstance(result, NoOpResult) + assert result.command_tag == tag + assert result.set_setting is None + assert result.reset_setting is None + + +def _row_batch(name: str, value: str) -> RowBatch: + from slayer.core.enums import DataType + return RowBatch( + columns=[FacadeColumn(name=name, type=DataType.TEXT)], + rows=[{name: value}], + ) + + +def test_probe_matcher_can_return_outcome_with_settings_mutation(dialect) -> None: + """When a probe matcher returns a ``ProbeMatcherOutcome`` (rather than the + legacy bare ``RowBatch``), the mutation is tunneled through to the + ``ProbeResult.settings_mutation`` field. The PG facade uses this to + apply `set_config()` mutations on Execute (but not Describe). + """ + captured = {"name": "application_name", "value": "foo"} + + def matcher(parsed) -> ProbeMatcherOutcome | None: + return ProbeMatcherOutcome( + batch=_row_batch("set_config", captured["value"]), + settings_mutation=SetSettingOp( + name=captured["name"], value=captured["value"], + ), + ) + + result = translate( + sql="SELECT set_config('application_name', 'foo', false)", + catalog=_catalog(), dialect=dialect, probe_matcher=matcher, + ) + assert isinstance(result, ProbeResult) + assert result.batch.rows == [{"set_config": "foo"}] + assert result.settings_mutation == SetSettingOp( + name="application_name", value="foo", + ) + + +def test_probe_matcher_returning_bare_row_batch_still_works(dialect) -> None: + """Backwards compatibility: matchers that return a ``RowBatch`` directly + (the Flight default ``match_probe`` shape) produce a ``ProbeResult`` + with no mutation. Required for the shared facade not to break Flight.""" + def matcher(parsed) -> RowBatch | None: + return _row_batch("ok", "1") + + result = translate( + sql="SELECT 1", catalog=_catalog(), dialect=dialect, probe_matcher=matcher, + ) + assert isinstance(result, ProbeResult) + assert result.batch.rows == [{"ok": "1"}] + assert result.settings_mutation is None + + +def test_probe_result_settings_mutation_defaults_none(dialect) -> None: + """When the default probe matcher (Flight) handles a probe like + `SELECT 1`, no mutation is attached.""" + result = translate(sql="SELECT 1", catalog=_catalog(), dialect=dialect) + assert isinstance(result, ProbeResult) + assert result.settings_mutation is None + + +def test_set_setting_op_is_pydantic_model() -> None: + """SetSettingOp must be a Pydantic BaseModel (frozen-ish; equal by value). + Per project convention — never dataclasses.""" + from pydantic import BaseModel + assert issubclass(SetSettingOp, BaseModel) + assert SetSettingOp(name="x", value="y") == SetSettingOp(name="x", value="y") + assert SetSettingOp(name="x", value="y") != SetSettingOp(name="x", value="z") + + +def test_reset_setting_op_is_pydantic_model() -> None: + from pydantic import BaseModel + assert issubclass(ResetSettingOp, BaseModel) + assert ResetSettingOp(reset_all=True) == ResetSettingOp(reset_all=True) + assert ( + ResetSettingOp(name="x", reset_all=False) + != ResetSettingOp(reset_all=True) + ) + + + + + # --- table resolution -------------------------------------------------------- @@ -431,6 +779,203 @@ def test_order_by_aggregate_expression_resolves(dialect) -> None: assert result.query.order[0].direction == "desc" +# --- DEV-1568: MBQL aggregation-alias refs in WHERE/HAVING/ORDER BY ---------- +# +# Metabase compiles MBQL `["aggregation", N]` post-aggregation references to +# SQL that names the projection alias directly: +# * WHERE filter: WHERE "" (ungrammatical pre-GROUP-BY +# ref, but the live wire +# capture shows this exact +# shape — Metabase emits it +# as WHERE, not HAVING) +# * HAVING filter: HAVING "" (covered for symmetry; not +# observed on the wire but +# cheap to support) +# * ORDER BY: ORDER BY "" (alias-ref, NOT the +# catalog-side FacadeMetric.name) +# +# The translator must connect the alias back to the catalog aggregate's +# `measure_formula` (for filters) or `projected_name` (for OrderItem.column), +# NOT the catalog-side `FacadeMetric.name`. + + +def test_where_on_aggregate_alias_for_count_star_resolves(dialect) -> None: + result = translate( + sql='SELECT status, COUNT(*) AS "count" FROM orders ' + 'WHERE "count" > 1 GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["*:count > 1"] + + +def test_where_on_aggregate_alias_literal_on_left_flips(dialect) -> None: + result = translate( + sql='SELECT status, COUNT(*) AS "count" FROM orders ' + 'WHERE 1 < "count" GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["*:count > 1"] + + +def test_where_on_aggregate_alias_for_sum_resolves(dialect) -> None: + result = translate( + sql='SELECT status, SUM(revenue) AS "rev" FROM orders ' + 'WHERE "rev" > 1000 GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["revenue:sum > 1000"] + + +def test_having_on_aggregate_alias_resolves(dialect) -> None: + result = translate( + sql='SELECT status, SUM(revenue) AS "rev" FROM orders ' + 'GROUP BY status HAVING "rev" > 1000', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["revenue:sum > 1000"] + + +def test_order_by_aggregate_alias_for_count_star(dialect) -> None: + """Metabase shape: COUNT(*) aliased, ORDER BY references the alias. + + Pre-fix the translator built ``ColumnRef(name=item.metric.name)`` — + i.e. ``ColumnRef("row_count")`` — but ``plan.measures`` registered the + SLayer measure under ``projected_name`` (``"count"``), so the engine + failed with ``Referenced column "orders.row_count" not found``. + """ + result = translate( + sql='SELECT status, COUNT(*) AS "count" FROM orders ' + 'GROUP BY status ORDER BY "count" DESC', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].column.name == "count" + assert result.query.order[0].direction == "desc" + + +def test_order_by_aggregate_call_with_alias_uses_projected_name(dialect) -> None: + """ORDER BY repeats the aggregate-call literally, but the projection is + aliased — the OrderItem must use the user alias, not ``metric.name``.""" + result = translate( + sql='SELECT SUM(revenue) AS "rev" FROM orders ORDER BY SUM(revenue) DESC', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.order is not None + assert result.query.order[0].column.name == "rev" + assert result.query.order[0].direction == "desc" + + +def test_where_on_aggregate_alias_gte_comparator_resolves(dialect) -> None: + """Regression guard: alias detection must work for every comparator, not + just ``>``/``<``. Covers Codex review #4.""" + result = translate( + sql='SELECT status, COUNT(*) AS "count" FROM orders ' + 'WHERE "count" >= 5 GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["*:count >= 5"] + + +def test_where_on_aggregate_alias_count_distinct_resolves(dialect) -> None: + """Confirms the colon-form emission uses the catalog's pre-expanded + ``measure_formula`` (which canonicalises COUNT(DISTINCT col) → + ``col:count_distinct``) rather than a hand-built form. Covers Codex #6.""" + result = translate( + sql='SELECT status, COUNT(DISTINCT id) AS "uniq" FROM orders ' + 'WHERE "uniq" > 1 GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["id:count_distinct > 1"] + + +def test_where_aggregate_alias_against_non_literal_raises(dialect) -> None: + """An aggregate-alias compared to ANYTHING other than a literal must + raise — the verbatim fallback would silently emit broken SQL. Covers + Codex review #2.""" + with pytest.raises(TranslationError): + translate( + sql='SELECT status, COUNT(*) AS "count", SUM(revenue) AS "rev" ' + 'FROM orders WHERE "count" > "rev" GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + + +def test_where_aggregate_alias_match_is_case_sensitive(dialect) -> None: + """Lookup against ``items_by_projected_name`` is case-sensitive (mirrors + the existing ORDER BY alias-lookup behaviour). A WHERE conjunct whose + column-name case differs from the projection alias must NOT route through + the colon-form path — it falls through to verbatim, with the user's + original column-name case preserved. Covers Codex #3.""" + result = translate( + sql='SELECT status, COUNT(*) AS "count" FROM orders ' + 'WHERE "Count" > 1 GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + filters = result.query.filters or [] + # Did NOT route through the colon-form path… + assert "*:count > 1" not in filters + # …and DID fall through to the verbatim branch, preserving "Count". + assert len(filters) == 1 + assert '"Count"' in filters[0] + + +def test_having_aggregate_alias_against_non_literal_raises(dialect) -> None: + """HAVING symmetry of the WHERE raise rule. Covers Codex #1.""" + with pytest.raises(TranslationError): + translate( + sql='SELECT status, COUNT(*) AS "count", SUM(revenue) AS "rev" ' + 'FROM orders GROUP BY status HAVING "count" > "rev"', + catalog=_catalog(), dialect=dialect, + ) + + +@pytest.mark.parametrize("alias_form", [ + '"orders"."count"', # 2-part: table-qualified + '"public"."orders"."count"', # 3-part: schema.table-qualified (pg-facade) +]) +def test_where_on_qualified_aggregate_alias_resolves(alias_form, dialect) -> None: + """``strip_prefix`` must drop the FROM-table qualifier so a 2-part or + 3-part column ref to the aggregate alias resolves identically to the + bare form. Covers Codex #2.""" + result = translate( + sql=f'SELECT status, COUNT(*) AS "count" FROM orders ' + f'WHERE {alias_form} > 1 GROUP BY status', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + assert result.query.filters == ["*:count > 1"] + + +def test_where_on_dimension_alias_does_not_route_to_aggregate_path(dialect) -> None: + """The alias-detection helper is guarded on ``item.metric is not None``. + A dimension projection aliased to a name that would otherwise match must + fall through to the existing verbatim filter path, NOT raise (the + "must be a literal" rule applies only to aggregate-alias matches). + Covers Codex #3.""" + # status (dim) aliased to "count"; WHERE "count" = 'x' is a dim equality. + # Pre-fix and post-fix both fall through to verbatim because metric is None. + result = translate( + sql='SELECT status AS "count" FROM orders WHERE "count" = \'x\'', + catalog=_catalog(), dialect=dialect, + ) + assert isinstance(result, QueryResult) + filters = result.query.filters or [] + # Did NOT route through the colon-form path… + assert "*:count = 'x'" not in filters + # …and DID fall through to the verbatim path. + assert len(filters) == 1 + + + # --- time-grain wrapping ----------------------------------------------------- @@ -551,6 +1096,45 @@ def test_one_day_offset_on_non_week_is_preserved(dialect) -> None: ) +def test_sunday_week_wrapper_two_day_offset_rejected(dialect) -> None: + """The Sunday-week detector matches ONLY a one-day shift on EACH leg. A + two-day shift on either leg is a different bucketing transform (not + Metabase's wrapper) and must keep raising rather than collapse to + WEEK_SUNDAY. Exercises the exactly-one-day guard on both the inner + (column-side) and the outer (result-side) shift independently. + """ + # Inner +2 day (outer -1 day intact) — inner leg is not one day. + with pytest.raises(TranslationError): + translate( + sql=( + 'SELECT CAST((CAST(date_trunc(\'week\', ' + '("orders"."ordered_at" + INTERVAL \'2 day\')) AS DATE) ' + '+ INTERVAL \'-1 day\') AS DATE) AS "ordered_at", ' + 'COUNT(*) AS "count" FROM "orders" ' + 'GROUP BY CAST((CAST(date_trunc(\'week\', ' + '("orders"."ordered_at" + INTERVAL \'2 day\')) AS DATE) ' + '+ INTERVAL \'-1 day\') AS DATE)' + ), + catalog=_catalog(), dialect=dialect, + ) + # Inner +1 day intact, outer -2 day — outer leg is not one day. + with pytest.raises(TranslationError): + translate( + sql=( + 'SELECT CAST((CAST(date_trunc(\'week\', ' + '("orders"."ordered_at" + INTERVAL \'1 day\')) AS DATE) ' + '+ INTERVAL \'-2 day\') AS DATE) AS "ordered_at", ' + 'COUNT(*) AS "count" FROM "orders" ' + 'GROUP BY CAST((CAST(date_trunc(\'week\', ' + '("orders"."ordered_at" + INTERVAL \'1 day\')) AS DATE) ' + '+ INTERVAL \'-2 day\') AS DATE)' + ), + catalog=_catalog(), dialect=dialect, + ) + + + + def test_partial_sunday_week_wrapper_is_rejected(dialect) -> None: """The Sunday-week unwrap requires BOTH the outer ``-1 day`` shift and the inner ``+1 day`` shift to be present together. Half a wrapper is @@ -599,6 +1183,11 @@ def test_outer_week_day_offset_direction_aware(dialect) -> None: ) + + + + + # --- dialect-only parse acceptance ------------------------------------------ From 434a21c1e565d53ec7b14f17d9cc37cccc5aeef3 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 18:42:50 +0200 Subject: [PATCH 12/13] DEV-1566 PR #185 round 10: drop silently-coercive DECIMAL/NUMERIC/TIMESTAMPTZ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round 9 (against origin/main): _SQLGLOT_TYPE_TO_DATATYPE silently collapsed `DECIMAL`/`NUMERIC` onto `DataType.DOUBLE` and `TIMESTAMPTZ` / `TIMESTAMP WITH TIME ZONE` onto `DataType.TIMESTAMP`. PostgreSQL exposes those as OID 1700 (`numeric`, exact-precision) and OID 1184 (`timestamptz`, TZ-aware) respectively — distinct from the OID 701 / 1114 the coerced mapping advertised. Clients requesting NUMERIC got `float8` on the wire (potential precision loss); clients requesting TIMESTAMPTZ got `timestamp` (no-TZ wire type AND no timezone conversion in the encoder). Drop the three entries (DECIMAL, TIMESTAMPTZ, TIMESTAMPLTZ — NUMERIC was already canonicalised to DECIMAL by sqlglot). The targets now fall through to the existing "Unsupported projection expression" error, mirroring UUID/JSON/ARRAY/STRUCT. Tests: - The existing parametrised alias-collapse test loses the three coercive rows. - New `test_cast_exact_or_timezone_aware_types_rejected` pins the new rejection for DECIMAL/NUMERIC/TIMESTAMPTZ with the unsupported-projection expression error. Docs: pg-facade.md's "Unsupported target types" sentence now names DECIMAL/ NUMERIC and TIMESTAMPTZ explicitly with the OID / wire-type rationale. Co-Authored-By: Claude Opus 4.7 --- docs/interfaces/pg-facade.md | 12 +++++++++-- slayer/facade/translator.py | 22 ++++++++++++++------ tests/facade/test_translator.py | 36 ++++++++++++++++++++++++--------- 3 files changed, 52 insertions(+), 18 deletions(-) diff --git a/docs/interfaces/pg-facade.md b/docs/interfaces/pg-facade.md index b57ffdba..3c221f25 100644 --- a/docs/interfaces/pg-facade.md +++ b/docs/interfaces/pg-facade.md @@ -216,8 +216,16 @@ Admitted (source, target) coercions: Pairs outside the allowlist (e.g. `CAST(name AS INT)`, `CAST(amount AS BOOLEAN)`) raise `Unsupported CAST: cannot project column as (...). Admitted -coercions: see docs/interfaces/pg-facade.md.` Unsupported target types (`UUID`, `JSON`, -`ARRAY`, `STRUCT`, …) raise the standard `Unsupported projection expression` error. +coercions: see docs/interfaces/pg-facade.md.` Unsupported target types +(`UUID`, `JSON`, `ARRAY`, `STRUCT`, `DECIMAL`/`NUMERIC`, `TIMESTAMPTZ` / +`TIMESTAMP WITH TIME ZONE`, …) raise the standard `Unsupported projection +expression` error. `DECIMAL`/`NUMERIC` and `TIMESTAMPTZ` are deliberately +NOT admitted via a DOUBLE / TIMESTAMP coercion — Postgres clients reading +the projected column would receive OID 701 (`float8`) or OID 1114 +(`timestamp`, no-TZ) when they asked for OID 1700 (`numeric`, exact-precision) +or OID 1184 (`timestamptz`, TZ-aware), losing precision and TZ semantics. +Until SLayer adds native `NUMERIC` and `TIMESTAMPTZ` data types with their +own wire encoders, the only safe answer is to reject. `DOUBLE → INT` is intentionally excluded: Python's `int()` truncates toward zero while Postgres rounds half-to-even, so silently admitting the pair would diverge from diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index ca150150..99b6be32 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -285,9 +285,22 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: # projection support. Aliases sqlglot canonicalises at parse time (STRING→TEXT, # INTEGER→INT, NUMERIC→DECIMAL, BOOL→BOOLEAN, FLOAT→DOUBLE) need no entry; # REAL parses to exp.DataType.Type.FLOAT (not normalised), so DOUBLE coverage -# routes through here. Parameterised forms (VARCHAR(255), DECIMAL(10,2)) -# collapse onto the same base member at parse time — precision is dropped at -# the SLayer boundary because SLayer wire types don't carry it. +# routes through here. Parameterised forms (VARCHAR(255)) collapse onto the +# same base member at parse time — precision is dropped at the SLayer +# boundary because SLayer wire types don't carry it. +# +# Codex round 8 — silently-coercive targets DROPPED: +# * DECIMAL/NUMERIC: PostgreSQL exposes these as OID 1700 (`numeric`, +# exact-precision). DataType.DOUBLE would advertise OID 701 (`float8`) +# and round-trip via the float encoder — wrong wire type, potential +# precision loss. Until SLayer adds a NUMERIC DataType + OID 1700 + +# decimal encoder, the only safe answer is to reject. +# * TIMESTAMPTZ / TIMESTAMP WITH TIME ZONE / TIMESTAMPLTZ: OID 1184 with +# timezone-aware client decoding. DataType.TIMESTAMP would advertise +# OID 1114 (`timestamp`, no-TZ) — wrong wire type AND wrong timezone +# semantics. Reject until proper TIMESTAMPTZ support lands. +# These pairs now fall through to the "Unsupported projection expression" +# error, same as UUID/JSON/ARRAY/STRUCT. _SQLGLOT_TYPE_TO_DATATYPE: Dict[exp.DataType.Type, DataType] = { exp.DataType.Type.TEXT: DataType.TEXT, exp.DataType.Type.VARCHAR: DataType.TEXT, @@ -301,13 +314,10 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: exp.DataType.Type.MEDIUMINT: DataType.INT, exp.DataType.Type.DOUBLE: DataType.DOUBLE, exp.DataType.Type.FLOAT: DataType.DOUBLE, - exp.DataType.Type.DECIMAL: DataType.DOUBLE, exp.DataType.Type.BOOLEAN: DataType.BOOLEAN, exp.DataType.Type.DATE: DataType.DATE, exp.DataType.Type.TIMESTAMP: DataType.TIMESTAMP, exp.DataType.Type.DATETIME: DataType.TIMESTAMP, - exp.DataType.Type.TIMESTAMPTZ: DataType.TIMESTAMP, - exp.DataType.Type.TIMESTAMPLTZ: DataType.TIMESTAMP, } # DEV-1566: per-pair allowlist of CAST coercions admitted in projection. diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 2545cae6..85bb8f96 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1691,22 +1691,20 @@ def test_cast_text_target_admitted_from_every_source(col: str, dialect) -> None: ("INTEGER", DataType.INT), ("BIGINT", DataType.INT), ("SMALLINT", DataType.INT), - # DOUBLE-family aliases (floating/decimal collapse to DOUBLE). + # DOUBLE-family aliases (floating types collapse to DOUBLE; DECIMAL / + # NUMERIC are deliberately NOT here — see Codex round 8 carve-out). ("FLOAT", DataType.DOUBLE), ("REAL", DataType.DOUBLE), - ("DECIMAL", DataType.DOUBLE), - ("NUMERIC", DataType.DOUBLE), - # TIMESTAMP-family aliases. + # TIMESTAMP-family aliases — TIMESTAMPTZ deliberately NOT here. ("DATETIME", DataType.TIMESTAMP), - ("TIMESTAMPTZ", DataType.TIMESTAMP), ], ) def test_cast_sqlglot_type_aliases_map_to_slayer_datatype( type_alias: str, expected: DataType, ) -> None: """Each accepted sqlglot DataType.Type alias collapses onto the canonical - SLayer DataType. Pinned under postgres dialect since some aliases - (TIMESTAMPTZ) only parse cleanly there.""" + SLayer DataType. Pinned under postgres dialect since some aliases only + parse cleanly there.""" # Pick a source column whose declared type makes an # admitted pair: identity on the expected canonical type. source_col = { @@ -1723,10 +1721,28 @@ def test_cast_sqlglot_type_aliases_map_to_slayer_datatype( assert result.projection_types == [expected] +@pytest.mark.parametrize("type_alias", ["DECIMAL", "NUMERIC", "TIMESTAMPTZ"]) +def test_cast_exact_or_timezone_aware_types_rejected(type_alias: str) -> None: + """Codex round 8: ``DECIMAL`` / ``NUMERIC`` and ``TIMESTAMPTZ`` were + silently coerced into ``DataType.DOUBLE`` / ``DataType.TIMESTAMP`` + respectively. PostgreSQL exposes them as OID 1700 (`numeric`, exact) + and OID 1184 (`timestamptz`, TZ-aware) — distinct from the OID 701 / + 1114 the coerced mapping would advertise — so clients got the wrong + wire type. Pinned as rejected (falls through to the unsupported- + projection-expression error) until SLayer adds native NUMERIC and + TIMESTAMPTZ types + OIDs + encoders.""" + with pytest.raises(TranslationError) as exc_info: + translate( + sql=f"SELECT CAST(revenue AS {type_alias}) FROM orders", + catalog=_catalog(), dialect="postgres", + ) + assert "Unsupported projection expression" in str(exc_info.value) + + def test_cast_parameterised_type_form_works() -> None: - """sqlglot represents ``VARCHAR(255)`` / ``DECIMAL(10,2)`` etc. with their - precision modifier on the SAME ``DataType.Type`` member, so the mapping - collapses precision implicitly — SLayer wire types don't carry it.""" + """sqlglot represents ``VARCHAR(255)`` etc. with their precision modifier + on the SAME ``DataType.Type`` member, so the mapping collapses precision + implicitly — SLayer wire types don't carry it.""" result = translate( sql="SELECT CAST(status AS VARCHAR(255)) FROM orders", catalog=_catalog(), dialect="postgres", From 8c489e9c3357d6536766f0f724fc73b610932dc6 Mon Sep 17 00:00:00 2001 From: Egor Kraev Date: Thu, 18 Jun 2026 22:37:09 +0200 Subject: [PATCH 13/13] DEV-1566 PR #185 round 11: revert round-10 rejections, document coarse OID mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 10's rejection of DECIMAL/NUMERIC/TIMESTAMPTZ was over-cautious: - SLayer's engine has no exact NUMERIC type, so there's no precision to lose downstream of a DECIMAL cast. - The pg-facade encoder is OID-driven — the wire bytes always match the OID we advertise (no internal mismatch). - The only divergence is that the advertised OID is broader than what the user typed (`float8` vs `numeric`, `timestamp` vs `timestamptz`, `int8` vs `int2/4`). That's a coarse-mapping caveat, not corruption. Re-add DECIMAL, TIMESTAMPTZ, TIMESTAMPLTZ, SMALLINT, TINYINT, MEDIUMINT to `_SQLGLOT_TYPE_TO_DATATYPE`. Existing `INTEGER`/`INT` mapping (which also collapses onto OID 20 `int8`) was already documented implicitly via the codebase but is now explicit in the docs table. Source comment block rewritten to describe the coarsening positively ("CAST is a coarse wire-OID hint") rather than enumerating "rejected" targets. Tests: revert `test_cast_sqlglot_type_aliases_map_to_slayer_datatype` to the round-9 parametrisation (DECIMAL/NUMERIC/TIMESTAMPTZ rows restored). Round-10's `test_cast_exact_or_timezone_aware_types_rejected` removed. Docs: replace the round-10 "Unsupported target types" paragraph with a new "CAST coarse-OID mapping" subsection that explicitly tables each collapse (including the pre-existing INTEGER->int8 coarsening, per Egor's "ditto for decimal" — every coarse case is documented in the same table). Co-Authored-By: Claude Opus 4.7 --- docs/interfaces/pg-facade.md | 46 ++++++++++++++++++++++++++------- slayer/facade/translator.py | 36 +++++++++++++++----------- tests/facade/test_translator.py | 35 ++++++++----------------- 3 files changed, 68 insertions(+), 49 deletions(-) diff --git a/docs/interfaces/pg-facade.md b/docs/interfaces/pg-facade.md index 3c221f25..fb277c6b 100644 --- a/docs/interfaces/pg-facade.md +++ b/docs/interfaces/pg-facade.md @@ -216,16 +216,42 @@ Admitted (source, target) coercions: Pairs outside the allowlist (e.g. `CAST(name AS INT)`, `CAST(amount AS BOOLEAN)`) raise `Unsupported CAST: cannot project column as (...). Admitted -coercions: see docs/interfaces/pg-facade.md.` Unsupported target types -(`UUID`, `JSON`, `ARRAY`, `STRUCT`, `DECIMAL`/`NUMERIC`, `TIMESTAMPTZ` / -`TIMESTAMP WITH TIME ZONE`, …) raise the standard `Unsupported projection -expression` error. `DECIMAL`/`NUMERIC` and `TIMESTAMPTZ` are deliberately -NOT admitted via a DOUBLE / TIMESTAMP coercion — Postgres clients reading -the projected column would receive OID 701 (`float8`) or OID 1114 -(`timestamp`, no-TZ) when they asked for OID 1700 (`numeric`, exact-precision) -or OID 1184 (`timestamptz`, TZ-aware), losing precision and TZ semantics. -Until SLayer adds native `NUMERIC` and `TIMESTAMPTZ` data types with their -own wire encoders, the only safe answer is to reject. +coercions: see docs/interfaces/pg-facade.md.` Unsupported target types (`UUID`, +`JSON`, `ARRAY`, `STRUCT`, …) raise the standard `Unsupported projection +expression` error. + +#### CAST coarse-OID mapping + +CAST is a **coarse wire-OID hint**, not a precision-preserving conversion. +The SLayer engine projects the bare column unchanged; the pg-facade encoder +is OID-driven, so the wire bytes always match the OID we advertise. Some +PostgreSQL types the user can write in a CAST don't have a one-to-one +SLayer equivalent — those collapse onto the nearest broader SLayer type: + +| User wrote in `CAST(... AS X)` | SLayer maps to | Wire OID advertised | +|---|---|---| +| `INTEGER` / `INT` (pre-existing) | `DataType.INT` | 20 (`int8`) — not 23 (`int4`) | +| `SMALLINT` | `DataType.INT` | 20 (`int8`) — not 21 (`int2`) | +| `TINYINT` / `MEDIUMINT` (non-Postgres widths) | `DataType.INT` | 20 (`int8`) | +| `BIGINT` | `DataType.INT` | 20 (`int8`) ✓ exact match | +| `DECIMAL` / `NUMERIC` | `DataType.DOUBLE` | 701 (`float8`) — not 1700 (`numeric`) | +| `FLOAT` / `REAL` / `DOUBLE` | `DataType.DOUBLE` | 701 (`float8`) ✓ | +| `TIMESTAMPTZ` / `TIMESTAMP WITH TIME ZONE` | `DataType.TIMESTAMP` | 1114 (`timestamp`, no TZ) — not 1184 (`timestamptz`) | +| `TIMESTAMP` / `DATETIME` | `DataType.TIMESTAMP` | 1114 (`timestamp`) ✓ | + +What this means in practice: + +- The wire bytes the client receives are always consistent with the OID we + advertise (the encoder picks the binary/text form from the OID). There is + no value corruption. +- The OID is potentially broader than what the user typed. A client that + asked for `NUMERIC` and got `float8` sees a float on the wire and decodes + it correctly as a float — but loses the "exact precision" expectation. + A client that asked for `TIMESTAMPTZ` sees naive `timestamp` bytes — and + loses TZ-aware decoding semantics. +- Callers needing exact `NUMERIC` precision, narrow integer wire widths, or + TZ-aware timestamps must compute upstream (or wait for SLayer to model + those types natively). `DOUBLE → INT` is intentionally excluded: Python's `int()` truncates toward zero while Postgres rounds half-to-even, so silently admitting the pair would diverge from diff --git a/slayer/facade/translator.py b/slayer/facade/translator.py index 99b6be32..1b42deeb 100644 --- a/slayer/facade/translator.py +++ b/slayer/facade/translator.py @@ -285,22 +285,25 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: # projection support. Aliases sqlglot canonicalises at parse time (STRING→TEXT, # INTEGER→INT, NUMERIC→DECIMAL, BOOL→BOOLEAN, FLOAT→DOUBLE) need no entry; # REAL parses to exp.DataType.Type.FLOAT (not normalised), so DOUBLE coverage -# routes through here. Parameterised forms (VARCHAR(255)) collapse onto the -# same base member at parse time — precision is dropped at the SLayer -# boundary because SLayer wire types don't carry it. +# routes through here. Parameterised forms (VARCHAR(255), DECIMAL(10,2)) +# collapse onto the same base member at parse time — precision is dropped at +# the SLayer boundary because SLayer wire types don't carry it. # -# Codex round 8 — silently-coercive targets DROPPED: -# * DECIMAL/NUMERIC: PostgreSQL exposes these as OID 1700 (`numeric`, -# exact-precision). DataType.DOUBLE would advertise OID 701 (`float8`) -# and round-trip via the float encoder — wrong wire type, potential -# precision loss. Until SLayer adds a NUMERIC DataType + OID 1700 + -# decimal encoder, the only safe answer is to reject. -# * TIMESTAMPTZ / TIMESTAMP WITH TIME ZONE / TIMESTAMPLTZ: OID 1184 with -# timezone-aware client decoding. DataType.TIMESTAMP would advertise -# OID 1114 (`timestamp`, no-TZ) — wrong wire type AND wrong timezone -# semantics. Reject until proper TIMESTAMPTZ support lands. -# These pairs now fall through to the "Unsupported projection expression" -# error, same as UUID/JSON/ARRAY/STRUCT. +# CAST is a COARSE wire-OID hint, not a precision-preserving conversion. The +# SLayer engine projects the bare column unchanged; the pg-facade encoder is +# OID-driven, so the wire bytes always match the OID we advertise. The only +# "mismatch" exposed by the coarsenings below is that the advertised OID is +# broader than what the user typed: +# +# * DECIMAL/NUMERIC → DataType.DOUBLE (OID 701 `float8`, not 1700 `numeric`). +# * INTEGER / SMALLINT / TINYINT / MEDIUMINT → DataType.INT +# (OID 20 `int8`, not 23/21). +# * TIMESTAMPTZ / TIMESTAMP WITH TIME ZONE / TIMESTAMPLTZ → DataType.TIMESTAMP +# (OID 1114 `timestamp`, no TZ semantics). +# +# Callers needing exact NUMERIC precision, narrow integer wire widths, or +# TZ-aware timestamps must compute upstream. See pg-facade.md +# §"CAST coarse-OID mapping" for the user-facing table. _SQLGLOT_TYPE_TO_DATATYPE: Dict[exp.DataType.Type, DataType] = { exp.DataType.Type.TEXT: DataType.TEXT, exp.DataType.Type.VARCHAR: DataType.TEXT, @@ -314,10 +317,13 @@ def __init__(self, message: str, *, status: str = "INVALID_ARGUMENT") -> None: exp.DataType.Type.MEDIUMINT: DataType.INT, exp.DataType.Type.DOUBLE: DataType.DOUBLE, exp.DataType.Type.FLOAT: DataType.DOUBLE, + exp.DataType.Type.DECIMAL: DataType.DOUBLE, exp.DataType.Type.BOOLEAN: DataType.BOOLEAN, exp.DataType.Type.DATE: DataType.DATE, exp.DataType.Type.TIMESTAMP: DataType.TIMESTAMP, exp.DataType.Type.DATETIME: DataType.TIMESTAMP, + exp.DataType.Type.TIMESTAMPTZ: DataType.TIMESTAMP, + exp.DataType.Type.TIMESTAMPLTZ: DataType.TIMESTAMP, } # DEV-1566: per-pair allowlist of CAST coercions admitted in projection. diff --git a/tests/facade/test_translator.py b/tests/facade/test_translator.py index 85bb8f96..8b977444 100644 --- a/tests/facade/test_translator.py +++ b/tests/facade/test_translator.py @@ -1687,24 +1687,29 @@ def test_cast_text_target_admitted_from_every_source(col: str, dialect) -> None: # TEXT-family aliases. ("VARCHAR", DataType.TEXT), ("CHAR", DataType.TEXT), - # INT-family aliases. + # INT-family aliases (narrowing widths collapse to DataType.INT — see + # the coarse-OID note in pg-facade.md). ("INTEGER", DataType.INT), ("BIGINT", DataType.INT), ("SMALLINT", DataType.INT), - # DOUBLE-family aliases (floating types collapse to DOUBLE; DECIMAL / - # NUMERIC are deliberately NOT here — see Codex round 8 carve-out). + # DOUBLE-family aliases (floating + DECIMAL/NUMERIC all collapse to + # DataType.DOUBLE — same coarse-OID note). ("FLOAT", DataType.DOUBLE), ("REAL", DataType.DOUBLE), - # TIMESTAMP-family aliases — TIMESTAMPTZ deliberately NOT here. + ("DECIMAL", DataType.DOUBLE), + ("NUMERIC", DataType.DOUBLE), + # TIMESTAMP-family aliases (TIMESTAMPTZ / TIMESTAMP WITH TIME ZONE + # collapse to DataType.TIMESTAMP — same coarse-OID note). ("DATETIME", DataType.TIMESTAMP), + ("TIMESTAMPTZ", DataType.TIMESTAMP), ], ) def test_cast_sqlglot_type_aliases_map_to_slayer_datatype( type_alias: str, expected: DataType, ) -> None: """Each accepted sqlglot DataType.Type alias collapses onto the canonical - SLayer DataType. Pinned under postgres dialect since some aliases only - parse cleanly there.""" + SLayer DataType. Pinned under postgres dialect since some aliases + (TIMESTAMPTZ) only parse cleanly there.""" # Pick a source column whose declared type makes an # admitted pair: identity on the expected canonical type. source_col = { @@ -1721,24 +1726,6 @@ def test_cast_sqlglot_type_aliases_map_to_slayer_datatype( assert result.projection_types == [expected] -@pytest.mark.parametrize("type_alias", ["DECIMAL", "NUMERIC", "TIMESTAMPTZ"]) -def test_cast_exact_or_timezone_aware_types_rejected(type_alias: str) -> None: - """Codex round 8: ``DECIMAL`` / ``NUMERIC`` and ``TIMESTAMPTZ`` were - silently coerced into ``DataType.DOUBLE`` / ``DataType.TIMESTAMP`` - respectively. PostgreSQL exposes them as OID 1700 (`numeric`, exact) - and OID 1184 (`timestamptz`, TZ-aware) — distinct from the OID 701 / - 1114 the coerced mapping would advertise — so clients got the wrong - wire type. Pinned as rejected (falls through to the unsupported- - projection-expression error) until SLayer adds native NUMERIC and - TIMESTAMPTZ types + OIDs + encoders.""" - with pytest.raises(TranslationError) as exc_info: - translate( - sql=f"SELECT CAST(revenue AS {type_alias}) FROM orders", - catalog=_catalog(), dialect="postgres", - ) - assert "Unsupported projection expression" in str(exc_info.value) - - def test_cast_parameterised_type_form_works() -> None: """sqlglot represents ``VARCHAR(255)`` etc. with their precision modifier on the SAME ``DataType.Type`` member, so the mapping collapses precision