diffly/diffly/_conditions.py at 8dfe919e11919c051e60d882f903ae5db0f46dcb · Quantco/diffly · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# Copyright (c) QuantCo 2025-2026
# SPDX-License-Identifier: BSD-3-Clause

import datetime as dt
from collections.abc import Mapping
from typing import cast

import polars as pl
from polars.datatypes import DataType, DataTypeClass

from diffly._utils import (
    ABS_TOL_DEFAULT,
    ABS_TOL_TEMPORAL_DEFAULT,
    REL_TOL_DEFAULT,
    Side,
)


def condition_equal_rows(
    columns: list[str],
    schema_left: pl.Schema,
    schema_right: pl.Schema,
    max_list_lengths_by_column: Mapping[str, int],
    abs_tol_by_column: Mapping[str, float],
    rel_tol_by_column: Mapping[str, float],
    abs_tol_temporal_by_column: Mapping[str, dt.timedelta],
) -> pl.Expr:
    """Combine the per-column equality checks into one row-level expression.

    For every name in ``columns`` a comparison is built via
    :meth:`condition_equal_columns`, using the column's dtype from both schemas. The
    resulting expressions are AND-ed together with ``pl.all_horizontal``.

    The tolerance mappings are indexed directly and therefore need an entry for every
    compared column. ``max_list_lengths_by_column`` is queried with ``.get`` and may omit
    columns, in which case ``None`` is forwarded.

    Returns ``pl.lit(True)`` when ``columns`` is empty.
    """
    # Nothing to compare: every pair of rows is trivially equal.
    if not columns:
        return pl.lit(True)

    per_column_checks: list[pl.Expr] = []
    for name in columns:
        per_column_checks.append(
            condition_equal_columns(
                column=name,
                dtype_left=schema_left[name],
                dtype_right=schema_right[name],
                max_list_length=max_list_lengths_by_column.get(name),
                abs_tol=abs_tol_by_column[name],
                rel_tol=rel_tol_by_column[name],
                abs_tol_temporal=abs_tol_temporal_by_column[name],
            )
        )

    return pl.all_horizontal(per_column_checks)


def condition_equal_columns(
    column: str,
    dtype_left: pl.DataType,
    dtype_right: pl.DataType,
    max_list_length: int | None,
    abs_tol: float = ABS_TOL_DEFAULT,
    rel_tol: float = REL_TOL_DEFAULT,
    abs_tol_temporal: dt.timedelta = ABS_TOL_TEMPORAL_DEFAULT,
) -> pl.Expr:
    """Build the equality expression for a single base column.

    The two sides are addressed as ``f"{column}_{Side.LEFT}"`` and
    ``f"{column}_{Side.RIGHT}"``; the dtype-dependent comparison itself is delegated to
    :meth:`_compare_columns`, to which all remaining arguments are forwarded unchanged.
    """
    left_expr = pl.col(f"{column}_{Side.LEFT}")
    right_expr = pl.col(f"{column}_{Side.RIGHT}")

    return _compare_columns(
        col_left=left_expr,
        col_right=right_expr,
        dtype_left=dtype_left,
        dtype_right=dtype_right,
        max_list_length=max_list_length,
        abs_tol=abs_tol,
        rel_tol=rel_tol,
        abs_tol_temporal=abs_tol_temporal,
    )


# --------------------------------------- UTILS -------------------------------------- #


def _can_compare_dtypes(
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
) -> bool:
    """Decide whether values of the two dtypes can be compared at all.

    Identical dtypes are always comparable, and so is any dtype paired with
    ``pl.Null``. Otherwise both dtypes must fall into the same broad category along
    every one of these axes: numeric-or-boolean, temporal, nested, and struct.
    """
    if dtype_left == dtype_right:
        return True
    if dtype_left == pl.Null or dtype_right == pl.Null:
        return True

    # Booleans are grouped together with the numeric dtypes.
    left_number_like = dtype_left.is_numeric() or dtype_left == pl.Boolean
    right_number_like = dtype_right.is_numeric() or dtype_right == pl.Boolean
    if left_number_like != right_number_like:
        return False

    if dtype_left.is_temporal() != dtype_right.is_temporal():
        return False

    if dtype_left.is_nested() != dtype_right.is_nested():
        return False

    # Among the nested dtypes, a struct only pairs with another struct.
    left_is_struct = dtype_left == pl.Struct
    right_is_struct = dtype_right == pl.Struct
    return left_is_struct == right_is_struct


def _compare_columns(
    col_left: pl.Expr,
    col_right: pl.Expr,
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
    max_list_length: int | None,
    abs_tol: float,
    rel_tol: float,
    abs_tol_temporal: dt.timedelta,
) -> pl.Expr:
    """Build an expression whether two expressions yield the same value.

    This method is more generic than :meth:`condition_equal_columns` as it accepts two
    arbitrary expressions rather than a "base column name".

    The comparison strategy is picked from the dtypes:

    - Dtypes rejected by :meth:`_can_compare_dtypes` never compare equal.
    - Two structs are compared field by field (recursively); structs with differing
      field names never compare equal.
    - Two list/array columns are compared element-wise via
      :meth:`_compare_sequence_columns` if their inner dtypes require it, and with
      ``eq_missing`` on the whole value otherwise.
    - Differing enums, or an enum paired with a categorical, are compared as strings.
    - Everything else is handled by :meth:`_compare_primitive_columns`.

    Args:
        col_left: Expression yielding the left-hand values.
        col_right: Expression yielding the right-hand values.
        dtype_left: Data type of ``col_left``.
        dtype_right: Data type of ``col_right``.
        max_list_length: Number of elements to compare when both sides are lists;
            forwarded unchanged to nested comparisons.
        abs_tol: Absolute tolerance for float comparisons.
        rel_tol: Relative tolerance for float comparisons.
        abs_tol_temporal: Absolute tolerance for temporal comparisons.

    Returns:
        A boolean expression describing whether the two sides are equal.
    """
    if not _can_compare_dtypes(dtype_left, dtype_right):
        return pl.repeat(pl.lit(False), pl.len())

    # If we encounter nested dtypes, we have to treat them specially
    if dtype_left.is_nested():
        # NOTE: `_can_compare_dtypes` also accepts a nested dtype paired with
        # `pl.Null`. The right side is therefore checked explicitly (rather than
        # asserted) so that this combination falls through to the generic comparison
        # below instead of failing -- just like the mirrored case where the *left*
        # side is the non-nested one and this whole branch is skipped.
        if isinstance(dtype_left, pl.Struct) and isinstance(dtype_right, pl.Struct):
            # For two structs, we necessarily need to have matching field names (the
            # order does not matter). If that isn't the case, we cannot observe equality
            fields_left = {f.name: f.dtype for f in dtype_left.fields}
            fields_right = {f.name: f.dtype for f in dtype_right.fields}
            if fields_left.keys() != fields_right.keys():
                return pl.repeat(pl.lit(False), pl.len())

            # Otherwise, we simply compare all fields independently
            return pl.all_horizontal(
                [
                    _compare_columns(
                        col_left=col_left.struct[field],
                        col_right=col_right.struct[field],
                        dtype_left=fields_left[field],
                        dtype_right=fields_right[field],
                        max_list_length=max_list_length,
                        abs_tol=abs_tol,
                        rel_tol=rel_tol,
                        abs_tol_temporal=abs_tol_temporal,
                    )
                    for field in fields_left
                ]
            )
        elif isinstance(dtype_left, pl.List | pl.Array) and isinstance(
            dtype_right, pl.List | pl.Array
        ):
            # Only pay for the element-wise expansion if the inner dtypes need
            # tolerances or other special handling.
            if _needs_element_wise_comparison(dtype_left.inner, dtype_right.inner):
                return _compare_sequence_columns(
                    col_left=col_left,
                    col_right=col_right,
                    dtype_left=dtype_left,
                    dtype_right=dtype_right,
                    max_list_length=max_list_length,
                    abs_tol=abs_tol,
                    rel_tol=rel_tol,
                    abs_tol_temporal=abs_tol_temporal,
                )
            return col_left.eq_missing(col_right)

    if _different_enums(dtype_left, dtype_right) or _enum_and_categorical(
        dtype_left, dtype_right
    ):
        # Enums with different categories as well as enums and categoricals
        # can't be compared directly.
        # Fall back to comparison of strings.
        return _compare_columns(
            col_left=col_left.cast(pl.String),
            col_right=col_right.cast(pl.String),
            dtype_left=pl.String,
            dtype_right=pl.String,
            abs_tol=abs_tol,
            rel_tol=rel_tol,
            abs_tol_temporal=abs_tol_temporal,
            max_list_length=max_list_length,
        )

    return _compare_primitive_columns(
        col_left=col_left,
        col_right=col_right,
        dtype_left=dtype_left,
        dtype_right=dtype_right,
        abs_tol=abs_tol,
        rel_tol=rel_tol,
        abs_tol_temporal=abs_tol_temporal,
    )


def _compare_sequence_columns(
    col_left: pl.Expr,
    col_right: pl.Expr,
    dtype_left: pl.List | pl.Array,
    dtype_right: pl.List | pl.Array,
    max_list_length: int | None,
    abs_tol: float,
    rel_tol: float,
    abs_tol_temporal: dt.timedelta,
) -> pl.Expr:
    """Compare Array/List columns element-wise with tolerance.

    The sequences are unrolled into a fixed number of positional comparisons, each of
    which is delegated to :meth:`_compare_columns` with the inner dtypes. Two sequences
    are equal if they have the same length and all positions match; null sequences are
    only equal to other null sequences.

    The number of compared positions is the array width if at least one side is an
    array, and ``max_list_length`` if both sides are lists.

    Args:
        col_left: Expression yielding the left-hand sequences.
        col_right: Expression yielding the right-hand sequences.
        dtype_left: List or array dtype of ``col_left``.
        dtype_right: List or array dtype of ``col_right``.
        max_list_length: Number of positions to compare for list-vs-list comparisons;
            also forwarded to the comparisons of the individual elements.
        abs_tol: Absolute tolerance for float comparisons.
        rel_tol: Relative tolerance for float comparisons.
        abs_tol_temporal: Absolute tolerance for temporal comparisons.

    Returns:
        A boolean expression describing whether the two sequences are equal.

    Raises:
        TypeError: If both sides are lists and ``max_list_length`` is ``None``.
    """
    n_elements: int
    has_same_length: pl.Expr

    if isinstance(dtype_left, pl.Array) and isinstance(dtype_right, pl.Array):
        # Arrays of different shapes can never hold equal values.
        if dtype_left.shape != dtype_right.shape:
            return pl.repeat(pl.lit(False), pl.len())
        n_elements = dtype_left.shape[0]
        has_same_length = pl.repeat(pl.lit(True), pl.len())
    elif isinstance(dtype_left, pl.Array) and isinstance(dtype_right, pl.List):
        n_elements = dtype_left.shape[0]
        has_same_length = col_right.list.len().eq(pl.lit(n_elements))
    elif isinstance(dtype_left, pl.List) and isinstance(dtype_right, pl.Array):
        n_elements = dtype_right.shape[0]
        has_same_length = col_left.list.len().eq(pl.lit(n_elements))
    else:  # pl.List vs pl.List
        # Fail with a clear message instead of an opaque `range(None)` error below.
        if max_list_length is None:
            raise TypeError(
                "`max_list_length` must be an integer to compare two list columns "
                "element-wise, got None"
            )
        n_elements = max_list_length
        has_same_length = col_left.list.len().eq_missing(col_right.list.len())

    if n_elements == 0:
        # There are no positions to compare, but the lengths still have to agree:
        # a zero-width array must not match a non-empty list.
        return _eq_missing(has_same_length, col_left, col_right)

    def _get_element(col: pl.Expr, dtype: DataType | DataTypeClass, i: int) -> pl.Expr:
        if isinstance(dtype, pl.Array):
            return col.arr.get(i)
        # Lists may be shorter than `n_elements`; out-of-bounds reads yield null.
        return col.list.get(i, null_on_oob=True)

    elements_match = pl.all_horizontal(
        [
            _compare_columns(
                col_left=_get_element(col_left, dtype_left, i),
                col_right=_get_element(col_right, dtype_right, i),
                dtype_left=dtype_left.inner,
                dtype_right=dtype_right.inner,
                abs_tol=abs_tol,
                rel_tol=rel_tol,
                abs_tol_temporal=abs_tol_temporal,
                max_list_length=max_list_length,
            )
            for i in range(n_elements)
        ]
    )

    return _eq_missing(has_same_length & elements_match, col_left, col_right)


def _is_float_numeric_pair(
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
) -> bool:
    """Check whether both dtypes are numeric and at least one of them is a float."""
    any_float = dtype_left.is_float() or dtype_right.is_float()
    if not any_float:
        return False
    return dtype_left.is_numeric() and dtype_right.is_numeric()


def _is_temporal_pair(
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
) -> bool:
    """Check whether both dtypes are temporal."""
    if not dtype_left.is_temporal():
        return False
    return dtype_right.is_temporal()


def _needs_element_wise_comparison(
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
) -> bool:
    """Tell whether a pair of dtypes has to be compared element by element.

    This is the case whenever the comparison involves tolerances (float or temporal
    pairs) or a string fallback (differing enums, enum vs. categorical) -- either
    directly or anywhere inside struct fields or list/array inner dtypes.

    A result of False means that eq_missing() on the whole column would produce
    identical results, so the expensive element-wise iteration for list/array columns
    can be skipped.
    """
    if _is_float_numeric_pair(dtype_left, dtype_right):
        return True
    if _is_temporal_pair(dtype_left, dtype_right):
        return True
    if _different_enums(dtype_left, dtype_right):
        return True
    if _enum_and_categorical(dtype_left, dtype_right):
        return True

    if isinstance(dtype_left, pl.Struct) and isinstance(dtype_right, pl.Struct):
        left_by_name = {f.name: f.dtype for f in dtype_left.fields}
        right_by_name = {f.name: f.dtype for f in dtype_right.fields}
        # Only fields present on both sides are inspected.
        for field_name, left_inner in left_by_name.items():
            if field_name not in right_by_name:
                continue
            if _needs_element_wise_comparison(left_inner, right_by_name[field_name]):
                return True
        return False

    sequence_types = (pl.List, pl.Array)
    if isinstance(dtype_left, sequence_types) and isinstance(
        dtype_right, sequence_types
    ):
        return _needs_element_wise_comparison(dtype_left.inner, dtype_right.inner)

    return False


def _compare_primitive_columns(
    col_left: pl.Expr,
    col_right: pl.Expr,
    dtype_left: DataType | DataTypeClass,
    dtype_right: DataType | DataTypeClass,
    abs_tol: float,
    rel_tol: float,
    abs_tol_temporal: dt.timedelta,
) -> pl.Expr:
    """Compare two non-nested expressions, applying tolerances where applicable.

    - Numeric pairs involving a float are compared with ``is_close`` using ``abs_tol``
      and ``rel_tol``; nulls and NaNs are handled by :meth:`_eq_missing_with_nan`.
    - Temporal pairs are equal if their absolute difference is at most
      ``abs_tol_temporal``; nulls are handled by :meth:`_eq_missing`.
    - All other pairs are compared with ``eq_missing``.
    """
    if _is_float_numeric_pair(dtype_left, dtype_right):
        within_tolerance = col_left.is_close(
            col_right, abs_tol=abs_tol, rel_tol=rel_tol
        )
        return _eq_missing_with_nan(within_tolerance, lhs=col_left, rhs=col_right)

    if _is_temporal_pair(dtype_left, dtype_right):
        distance = (col_left - col_right).abs()
        within_tolerance = distance <= abs_tol_temporal
        return _eq_missing(within_tolerance, lhs=col_left, rhs=col_right)

    return col_left.eq_missing(col_right)


def _eq_missing(expr: pl.Expr, lhs: pl.Expr, rhs: pl.Expr) -> pl.Expr:
    """Make an equality expression treat nulls as values.

    The result is ``expr`` where both ``lhs`` and ``rhs`` are non-null, and true
    where both are null.
    """
    neither_missing = lhs.is_not_null() & rhs.is_not_null()
    both_missing = lhs.is_null() & rhs.is_null()
    equal_when_present = expr & neither_missing
    return equal_when_present | both_missing


def _eq_missing_with_nan(expr: pl.Expr, lhs: pl.Expr, rhs: pl.Expr) -> pl.Expr:
    """Extend an equality expression to treat both nulls and NaNs as values.

    The result is true where ``expr`` holds or both sides are NaN (restricted to rows
    where both sides are non-null), and where both sides are null.
    """
    both_nan = lhs.is_nan() & rhs.is_nan()
    # `both_nan` is null for a (null, NaN) pair. It is therefore combined with `expr`
    # *before* the null mask of `_eq_missing` is applied: OR-ing it onto the masked
    # result would propagate that null instead of yielding False.
    return _eq_missing(expr | both_nan, lhs, rhs)


def _different_enums(
    left: DataType | DataTypeClass, right: DataType | DataTypeClass
) -> bool:
    """Check whether both dtypes are enum instances that are not equal."""
    both_enums = isinstance(left, pl.Enum) and isinstance(right, pl.Enum)
    if not both_enums:
        return False
    return left != right


def _enum_and_categorical(
    left: DataType | DataTypeClass, right: DataType | DataTypeClass
) -> bool:
    """Check whether one dtype is an enum and the other one a categorical."""
    mixed_pairings = (
        (pl.Enum, pl.Categorical),
        (pl.Categorical, pl.Enum),
    )
    return any(
        isinstance(left, left_type) and isinstance(right, right_type)
        for left_type, right_type in mixed_pairings
    )