Skip to content

Commit edda36e

Browse files
add logic to breaking categorizer
Signed-off-by: Alberto Suman <alberto.suman@1komma5grad.com>
1 parent 960775f commit edda36e

2 files changed

Lines changed: 159 additions & 2 deletions

File tree

sqlmesh/core/model/definition.py

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1588,7 +1588,7 @@ def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
15881588

15891589
for edit in edits:
15901590
if not isinstance(edit, Insert):
1591-
return None
1591+
return _additive_projection_change(previous_query, this_query, self.dialect)
15921592

15931593
expr = edit.expression
15941594
if isinstance(expr, exp.UDTF):
@@ -1602,7 +1602,7 @@ def is_breaking_change(self, previous: Model) -> t.Optional[bool]:
16021602
expr = parent
16031603

16041604
if not _is_projection(expr) and expr.parent not in inserted_expressions:
1605-
return None
1605+
return _additive_projection_change(previous_query, this_query, self.dialect)
16061606

16071607
return False
16081608

@@ -2907,6 +2907,75 @@ def _is_projection(expr: exp.Expr) -> bool:
29072907
return isinstance(parent, exp.Select) and expr.arg_key == "expressions"
29082908

29092909

2910+
def _additive_projection_change(
2911+
previous_query: exp.Query,
2912+
this_query: exp.Query,
2913+
dialect: DialectType,
2914+
) -> t.Optional[bool]:
2915+
"""Fallback for when SQLGlot's tree diff can't express an additive projection change.
2916+
2917+
SQLGlot's diff matches nodes by structural similarity, so interchangeable leaves (e.g. two
2918+
identical ``CAST(... AS T)`` target types) can be cross-matched. Inserting a same-type cast
2919+
above an existing one therefore yields spurious ``Move`` / ``Update`` edits even though a
2920+
column was simply added to the SELECT list. In that case the edit-based check above is
2921+
inconclusive, so we verify additivity directly against the output projections.
2922+
2923+
Returns ``False`` (non-breaking) only when the change is provably additive:
2924+
* both queries are simple ``SELECT`` statements,
2925+
* everything other than the projection list is structurally identical,
2926+
* no added projection is a (potentially cardinality-changing) ``UDTF``, and
2927+
* every previous projection is preserved, in order, within the new projection list.
2928+
2929+
Otherwise returns ``None`` (undetermined), preserving the conservative default.
2930+
"""
2931+
# UNIONs or other query expressions, are left to the caller's conservative diff result.
2932+
if not isinstance(previous_query, exp.Select) or not isinstance(this_query, exp.Select):
2933+
return None
2934+
2935+
previous_projections = previous_query.expressions
2936+
this_projections = this_query.expressions
2937+
# If the new query has not gained any projections, this cannot be an additive projection-only
2938+
# change, so there is nothing for this fallback to prove.
2939+
if len(this_projections) <= len(previous_projections):
2940+
return None
2941+
2942+
# Adding a UDTF projection (e.g. EXPLODE / UNNEST) can change row cardinality, so such a
2943+
# change is not safely non-breaking even when it appears as an extra SELECT item.
2944+
for projection in this_projections:
2945+
if isinstance(projection, exp.UDTF) and not projection.find_ancestor(exp.Subquery):
2946+
return None
2947+
2948+
# Everything other than the projection list must be structurally identical. Replacing each
2949+
# SELECT list with the same dummy literal lets the expression equality check focus on the
2950+
# FROM / WHERE / GROUP BY / ORDER BY / etc. parts of the query.
2951+
previous_skeleton = previous_query.copy()
2952+
this_skeleton = this_query.copy()
2953+
previous_skeleton.set("expressions", [exp.Literal.number(1)])
2954+
this_skeleton.set("expressions", [exp.Literal.number(1)])
2955+
if previous_skeleton != this_skeleton:
2956+
return None
2957+
2958+
# Every previous projection must appear, in order, within the new projection list. Comparing
2959+
# dialect-normalized SQL makes semantically equivalent projection nodes match even when the
2960+
# parser built distinct object identities.
2961+
this_projection_sql = [p.sql(dialect=dialect, comments=False) for p in this_projections]
2962+
search_start = 0
2963+
for projection in previous_projections:
2964+
target_sql = projection.sql(dialect=dialect, comments=False)
2965+
# Continue after the previous match so added columns can appear before, between, or after
2966+
# the original projections, but existing projections cannot be reordered or rewritten.
2967+
for index in range(search_start, len(this_projection_sql)):
2968+
if this_projection_sql[index] == target_sql:
2969+
search_start = index + 1
2970+
break
2971+
else:
2972+
return None
2973+
2974+
# At this point the query shape is unchanged and all prior outputs are preserved, so the only
2975+
# remaining difference is one or more additional, non-UDTF projections.
2976+
return False
2977+
2978+
29102979
def _single_expr_or_tuple(values: t.Sequence[exp.Expr]) -> exp.Expr | exp.Tuple:
29112980
return values[0] if len(values) == 1 else exp.Tuple(expressions=values)
29122981

tests/core/test_snapshot.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1633,6 +1633,94 @@ def test_categorize_change_sql(make_snapshot):
16331633
)
16341634

16351635

1636+
def test_categorize_change_sql_redundant_cast(make_snapshot):
1637+
# Adding a column with a redundant CAST above an existing same-type cast makes SQLGlot's tree
1638+
# diff emit spurious Move/Update edits (interchangeable DataType leaves get cross-matched),
1639+
# even though the change is purely an added projection. These must remain NON_BREAKING.
1640+
config = CategorizerConfig(sql=AutoCategorizationMode.SEMI)
1641+
1642+
old_snapshot = make_snapshot(
1643+
SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t"))
1644+
)
1645+
1646+
# A same-type cast column has been inserted mid-list, above the existing one.
1647+
assert (
1648+
categorize_change(
1649+
new=make_snapshot(
1650+
SqlModel(name="a", query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT FROM t"))
1651+
),
1652+
old=old_snapshot,
1653+
config=config,
1654+
)
1655+
== SnapshotChangeCategory.NON_BREAKING
1656+
)
1657+
1658+
# Multiple same-type cast columns have been inserted mid-list.
1659+
assert (
1660+
categorize_change(
1661+
new=make_snapshot(
1662+
SqlModel(
1663+
name="a", query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT, y::INT FROM t")
1664+
)
1665+
),
1666+
old=old_snapshot,
1667+
config=config,
1668+
)
1669+
== SnapshotChangeCategory.NON_BREAKING
1670+
)
1671+
1672+
# The type of an existing projection has changed (in addition to a new column): undetermined.
1673+
assert (
1674+
categorize_change(
1675+
new=make_snapshot(
1676+
SqlModel(name="a", query=parse_one("SELECT a::INT, x::TEXT, s::TEXT FROM t"))
1677+
),
1678+
old=old_snapshot,
1679+
config=config,
1680+
)
1681+
is None
1682+
)
1683+
1684+
# Existing projections have been reordered: undetermined.
1685+
assert (
1686+
categorize_change(
1687+
new=make_snapshot(
1688+
SqlModel(name="a", query=parse_one("SELECT s::TEXT, a::DATE FROM t"))
1689+
),
1690+
old=old_snapshot,
1691+
config=config,
1692+
)
1693+
is None
1694+
)
1695+
1696+
# An existing projection has been removed: undetermined.
1697+
assert (
1698+
categorize_change(
1699+
new=make_snapshot(SqlModel(name="a", query=parse_one("SELECT s::TEXT FROM t"))),
1700+
old=old_snapshot,
1701+
config=config,
1702+
)
1703+
is None
1704+
)
1705+
1706+
# A WHERE clause changed alongside the added cast column: undetermined.
1707+
assert (
1708+
categorize_change(
1709+
new=make_snapshot(
1710+
SqlModel(
1711+
name="a",
1712+
query=parse_one("SELECT a::DATE, x::TEXT, s::TEXT FROM t WHERE a = 2"),
1713+
)
1714+
),
1715+
old=make_snapshot(
1716+
SqlModel(name="a", query=parse_one("SELECT a::DATE, s::TEXT FROM t WHERE a = 1"))
1717+
),
1718+
config=config,
1719+
)
1720+
is None
1721+
)
1722+
1723+
16361724
def test_categorize_change_seed(make_snapshot, tmp_path):
16371725
config = CategorizerConfig(seed=AutoCategorizationMode.SEMI)
16381726
model_name = "test_db.test_seed_model"

0 commit comments

Comments
 (0)