Skip to content

Commit 2ffaf19

Browse files
author
ci bot
committed
Merge branch 'feat/TG-999-pii-masking' into 'enterprise'
feat: pii masking, xde, hash fingerprints See merge request dkinternal/testgen/dataops-testgen!439
2 parents 44aaa3f + 0146128 commit 2ffaf19

86 files changed

Lines changed: 1947 additions & 517 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

testgen/commands/queries/execute_tests_query.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -326,16 +326,21 @@ def _get_params(self, test_def: TestExecutionDef | None = None) -> dict:
326326
# Freshness exclusion params — computed per test at execution time
327327
if test_def.test_type == "Freshness_Trend" and test_def.baseline_sum:
328328
sched = get_schedule_params(test_def.prediction)
329-
has_exclusions = self._exclude_weekends or sched.excluded_days or sched.window_start is not None
329+
# Once the schedule is active (excluded_days derived from active_days),
330+
# it supersedes exclude_weekends as the single source of truth for
331+
# day exclusion — avoids conflicts where a detection day (e.g. Saturday)
332+
# is active per schedule but excluded per exclude_weekends.
333+
effective_exclude_weekends = False if sched.excluded_days else self._exclude_weekends
334+
has_exclusions = effective_exclude_weekends or sched.excluded_days or sched.window_start is not None
330335
if has_exclusions:
331336
last_update = pd.Timestamp(test_def.baseline_sum)
332-
excluded = int(count_excluded_minutes(
333-
last_update, self.run_date, self._exclude_weekends, self._holiday_dates,
337+
excluded = round(count_excluded_minutes(
338+
last_update, self.run_date, effective_exclude_weekends, self._holiday_dates,
334339
tz=self._schedule_tz, excluded_days=sched.excluded_days,
335340
window_start=sched.window_start, window_end=sched.window_end,
336341
))
337342
is_excl = 1 if is_excluded_day(
338-
pd.Timestamp(self.run_date), self._exclude_weekends, self._holiday_dates,
343+
pd.Timestamp(self.run_date), effective_exclude_weekends, self._holiday_dates,
339344
tz=self._schedule_tz, excluded_days=sched.excluded_days,
340345
window_start=sched.window_start, window_end=sched.window_end,
341346
) else 0

testgen/commands/queries/profiling_query.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ def update_profiling_results(self) -> list[tuple[str, dict]]:
167167
self._get_query("functional_datatype.sql"),
168168
self._get_query("functional_tabletype_stage.sql"),
169169
self._get_query("functional_tabletype_update.sql"),
170-
self._get_query("pii_flag.sql"),
171170
]
171+
if self.table_group.profile_flag_pii:
172+
queries.append(self._get_query("pii_flag.sql"))
173+
queries.append(self._get_query("pii_flag_update.sql"))
172174
if self.table_group.profile_flag_cdes:
173175
queries.append(self._get_query("cde_flagger_query.sql"))
174176
return queries

testgen/commands/run_profiling.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from testgen.common.mixpanel_service import MixpanelService
2929
from testgen.common.models import get_current_session, with_database_session
3030
from testgen.common.models.connection import Connection
31+
from testgen.common.models.data_column import DataColumnChars
3132
from testgen.common.models.profiling_run import ProfilingRun
3233
from testgen.common.models.table_group import TableGroup
3334
from testgen.common.models.test_suite import TestSuite
@@ -85,6 +86,8 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d
8586
LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}")
8687
try:
8788
data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime)
89+
if table_group.profile_exclude_xde:
90+
data_chars = _exclude_xde_columns(data_chars, table_group.id)
8891
distinct_tables = {(column.table_name, column.record_ct) for column in data_chars}
8992

9093
profiling_run.set_progress("data_chars", "Completed")
@@ -144,6 +147,22 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d
144147
"""
145148

146149

150+
def _exclude_xde_columns(data_chars: list[ColumnChars], table_group_id: UUID) -> list[ColumnChars]:
    """Drop columns flagged as excluded_data_element (XDE) in data_column_chars.

    Returns a new list containing only the non-excluded columns; the input
    list is left untouched. When no columns are flagged for the table group,
    the original list is returned as-is.
    """
    flagged = DataColumnChars.select_where(
        DataColumnChars.table_groups_id == table_group_id,
        DataColumnChars.excluded_data_element.is_(True),
    )
    if not flagged:
        return data_chars

    # Match on (table, column) pairs — column names alone are not unique
    # across tables within a table group.
    flagged_keys = {(row.table_name, row.column_name) for row in flagged}
    kept = [
        col for col in data_chars
        if (col.table_name, col.column_name) not in flagged_keys
    ]
    dropped_ct = len(data_chars) - len(kept)
    if dropped_ct:
        LOG.info(f"Excluding {dropped_ct} XDE columns from profiling")
    return kept
164+
165+
147166
def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnChars]) -> None:
148167
profiling_run = sql_generator.profiling_run
149168
profiling_run.set_progress("col_profiling", "Running")

testgen/commands/test_thresholds_prediction.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,11 @@ def compute_freshness_threshold(
213213
if schedule.stage == "active":
214214
excluded_days = frozenset(range(7)) - schedule.active_days if schedule.active_days else None
215215

216+
# Once the schedule is active, excluded_days is the single source of truth
217+
# for day exclusion — it supersedes exclude_weekends, which was the user's
218+
# initial hint before enough data was available for schedule inference.
219+
schedule_exclude_weekends = False if excluded_days else exclude_weekends
220+
216221
# For sub-daily schedules, apply window exclusion for overnight gaps
217222
has_window = (
218223
schedule.frequency == "sub_daily"
@@ -228,7 +233,7 @@ def compute_freshness_threshold(
228233
upper_percentile=upper_percentile,
229234
floor_multiplier=floor_multiplier,
230235
lower_percentile=lower_percentile,
231-
exclude_weekends=exclude_weekends,
236+
exclude_weekends=schedule_exclude_weekends,
232237
holiday_codes=holiday_codes,
233238
tz=schedule_tz,
234239
staleness_factor=staleness_factor,
@@ -246,7 +251,7 @@ def compute_freshness_threshold(
246251
holiday_dates = resolve_holiday_dates(holiday_codes, history.index) if holiday_codes else None
247252
schedule_upper = minutes_to_next_deadline(
248253
result.last_update, schedule,
249-
exclude_weekends, holiday_dates, schedule_tz,
254+
schedule_exclude_weekends, holiday_dates, schedule_tz,
250255
deadline_buffer, excluded_days=excluded_days,
251256
)
252257
if schedule_upper is not None:
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from uuid import UUID, uuid4
2+
3+
from sqlalchemy import Boolean, Column, ForeignKey, String
4+
from sqlalchemy.dialects import postgresql
5+
6+
from testgen.common.models.entity import Entity
7+
8+
9+
class DataColumnChars(Entity):
    """Partial ORM mapping of the data_column_chars table.

    Only the columns the Python code reads are mapped; the remaining
    physical columns are listed in the comment at the bottom of the class.
    """

    __tablename__ = "data_column_chars"

    # Primary key; the physical column is named "column_id".
    id: UUID = Column("column_id", postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4)
    table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("table_groups.id"))
    schema_name: str = Column(String)
    table_name: str = Column(String)
    column_name: str = Column(String)
    # True when the column is marked as an excluded data element (XDE).
    excluded_data_element: bool | None = Column(Boolean, nullable=True)
    # Non-null when the column has been flagged as containing PII.
    pii_flag: str | None = Column(String(50), nullable=True)

    _default_order_by = (id,)

    # Unmapped columns: table_id, ordinal_position, general_type, column_type,
    # db_data_type, functional_data_type, description, critical_data_element,
    # data_source, source_system, source_process, business_domain,
    # stakeholder_group, transform_level, aggregation_level, data_product,
    # add_date, last_mod_date, drop_date, test_ct, last_test_date,
    # tests_last_run, tests_7_days_prior, tests_30_days_prior,
    # fails_last_run, fails_7_days_prior, fails_30_days_prior,
    # warnings_last_run, warnings_7_days_prior, warnings_30_days_prior,
    # last_complete_profile_run_id, valid_profile_issue_ct,
    # valid_test_issue_ct, dq_score_profiling, dq_score_testing

testgen/common/models/table_group.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ class TableGroupMinimal(EntityMinimal):
2828
profile_use_sampling: bool
2929
profiling_delay_days: str
3030
monitor_test_suite_id: UUID | None
31+
profile_flag_cdes: bool
32+
profile_flag_pii: bool
33+
profile_exclude_xde: bool
3134
last_complete_profile_run_id: UUID | None
3235

3336

@@ -112,6 +115,8 @@ class TableGroup(Entity):
112115
profile_sample_min_count: int = Column(BigInteger, default=100000)
113116
profiling_delay_days: str = Column(String, default="0")
114117
profile_flag_cdes: bool = Column(Boolean, default=True)
118+
profile_flag_pii: bool = Column(Boolean, default=True)
119+
profile_exclude_xde: bool = Column(Boolean, default=True)
115120
profile_do_pair_rules: bool = Column(YNString, default="N")
116121
profile_pair_rule_pct: int = Column(Integer, default=95)
117122
include_in_dashboard: bool = Column(Boolean, default=True)

testgen/common/pii_masking.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""PII masking utilities for redacting sensitive data in the UI."""
2+
import pandas as pd
3+
4+
from testgen.ui.services.database_service import fetch_all_from_db
5+
6+
# Placeholder rendered wherever a PII value has been removed.
PII_REDACTED = "[PII Redacted]"

# Profiling result fields that can surface raw data values and therefore
# must be redacted for PII-flagged columns.
PROFILING_PII_FIELDS = (
    "top_freq_values", "min_text", "max_text",
    "min_value", "min_value_over_0", "max_value",
    "min_date", "max_date",
)
13+
14+
15+
def get_pii_columns(table_group_id: str, schema: str | None = None, table_name: str | None = None) -> set[str]:
    """Look up PII-flagged column names from data_column_chars.

    Optionally narrows the lookup to a single schema and/or table; the
    corresponding filter clauses are included only when a value is given.
    """

    query = f"""
        SELECT column_name
        FROM data_column_chars
        WHERE table_groups_id = :table_group_id
        AND pii_flag IS NOT NULL
        {"AND schema_name = :schema" if schema else ""}
        {"AND table_name = :table_name" if table_name else ""}
    """
    # Unused bind names are simply ignored when their clause is omitted.
    bind_params: dict = {
        "table_group_id": table_group_id,
        "schema": schema,
        "table_name": table_name,
    }

    return {record.column_name for record in fetch_all_from_db(query, bind_params)}
34+
35+
36+
def mask_source_data_pii(df: pd.DataFrame, pii_columns: set[str]) -> None:
    """In-place mask values in PII columns with PII_REDACTED.

    Args:
        df: Source-data frame; all values in any matching column are replaced.
        pii_columns: Column names flagged as PII; matched case-insensitively
            against df's column labels.
    """
    if df.empty or not pii_columns:
        return
    # Build the lowercase lookup set once — the original nested loop rescanned
    # pii_columns for every frame column (O(cols * pii)); this also matches
    # the pii_lower idiom used by the other masking helpers in this module.
    pii_lower = {c.lower() for c in pii_columns}
    for df_col in df.columns:
        # Non-string labels (e.g. integer positions) can never match a flagged
        # column name and would crash on .lower().
        if isinstance(df_col, str) and df_col.lower() in pii_lower:
            df[df_col] = PII_REDACTED
45+
46+
47+
def mask_hygiene_detail(data: pd.DataFrame | list[dict], pii_columns: set[str] | None = None) -> None:
    """Redact hygiene issue detail for PII columns where detail_redactable is true.

    Accepts:
    - DataFrame with detail_redactable, pii_flag, and detail columns (hygiene issues grid/export)
    - List of issue dicts, each with detail_redactable and either pii_flag or column_name
      (when pii_columns set is provided, matches column_name against it)

    Mutates *data* in place.
    """
    if isinstance(data, pd.DataFrame):
        # Both marker columns are required to decide: detail_redactable says the
        # anomaly detail exposes data values, pii_flag says the column holds PII.
        # Previously a frame without pii_flag raised KeyError.
        if data.empty or "detail_redactable" not in data.columns or "pii_flag" not in data.columns:
            return
        pii_mask = data["detail_redactable"].fillna(False) & data["pii_flag"].notna()
        data.loc[pii_mask, "detail"] = PII_REDACTED
        return

    if not data:
        return
    pii_lower = {c.lower() for c in pii_columns} if pii_columns else None
    for issue in data:
        if not issue.get("detail_redactable"):
            continue
        if pii_lower is not None:
            # `or ""` guards an explicit None value — dict.get's default only
            # applies when the key is absent, so .lower() could otherwise crash.
            if (issue.get("column_name") or "").lower() in pii_lower:
                issue["detail"] = PII_REDACTED
        elif issue.get("pii_flag"):
            issue["detail"] = PII_REDACTED
73+
74+
75+
def mask_profiling_pii(data: pd.DataFrame | dict, pii_columns: set[str]) -> None:
    """Mask profiling fields for PII columns. Accepts a DataFrame or a single-row dict.

    Mutates *data* in place; column_name matching is case-insensitive.
    """
    if isinstance(data, dict):
        if not pii_columns:
            return
        pii_lower = {c.lower() for c in pii_columns}
        column_name = data.get("column_name")
        # A row with a known, non-PII column is left untouched; a row with no
        # column_name is masked anyway (fail closed on PII).
        if column_name and column_name.lower() not in pii_lower:
            return
        for field in PROFILING_PII_FIELDS:
            if field in data:
                data[field] = PII_REDACTED
        return

    # Previously a frame without a column_name column raised KeyError.
    if data.empty or not pii_columns or "column_name" not in data.columns:
        return
    pii_lower = {c.lower() for c in pii_columns}
    mask = data["column_name"].str.lower().isin(pii_lower)
    for field in PROFILING_PII_FIELDS:
        if field in data.columns:
            # Numeric/date columns cannot hold the redaction string; widen the
            # dtype to object before assigning.
            if data[field].dtype != object:
                data[field] = data[field].astype(object)
            data.loc[mask, field] = PII_REDACTED

testgen/template/dbsetup/030_initialize_new_schema_structure.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ CREATE TABLE table_groups
113113
profile_sample_min_count BIGINT DEFAULT 100000,
114114
profiling_delay_days VARCHAR(3) DEFAULT '0',
115115
profile_flag_cdes BOOLEAN DEFAULT TRUE,
116+
profile_flag_pii BOOLEAN DEFAULT TRUE,
117+
profile_exclude_xde BOOLEAN DEFAULT TRUE,
116118
profile_do_pair_rules VARCHAR(3) DEFAULT 'N',
117119
profile_pair_rule_pct INTEGER DEFAULT 95,
118120
include_in_dashboard BOOLEAN DEFAULT TRUE,
@@ -341,6 +343,7 @@ CREATE TABLE profile_anomaly_types (
341343
anomaly_description VARCHAR(500),
342344
anomaly_criteria VARCHAR(2000),
343345
detail_expression VARCHAR(2000),
346+
detail_redactable BOOLEAN DEFAULT FALSE,
344347
issue_likelihood VARCHAR(50), -- Potential, Likely, Certain
345348
suggested_action VARCHAR(1000),
346349
dq_score_prevalence_formula TEXT,
@@ -447,6 +450,8 @@ CREATE TABLE data_column_chars (
447450
functional_data_type VARCHAR(50),
448451
description VARCHAR(1000),
449452
critical_data_element BOOLEAN,
453+
excluded_data_element BOOLEAN,
454+
pii_flag VARCHAR(50),
450455
data_source VARCHAR(40),
451456
source_system VARCHAR(40),
452457
source_process VARCHAR(40),
@@ -609,6 +614,7 @@ CREATE TABLE target_data_lookups (
609614
sql_flavor VARCHAR(20) NOT NULL,
610615
lookup_type VARCHAR(10),
611616
lookup_query VARCHAR,
617+
lookup_redactable_columns VARCHAR(100),
612618
error_type VARCHAR(30) NOT NULL,
613619
CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk
614620
PRIMARY KEY (test_id, sql_flavor, error_type)

testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ profile_anomaly_types:
1616
detail_expression: |-
1717
CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text
1818
ELSE 'Top Freq: ' || p.top_freq_values END
19+
detail_redactable: true
1920
issue_likelihood: Likely
2021
suggested_action: "Review your source data and follow-up with data owners to determine\
2122
\ whether this data needs to be corrected. "

testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ profile_anomaly_types:
99
p.std_pattern_match = 'DELIMITED_DATA'
1010
detail_expression: |-
1111
CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text ELSE 'Top Freq: ' || p.top_freq_values END
12+
detail_redactable: true
1213
issue_likelihood: Likely
1314
suggested_action: |-
1415
Review your source data and follow-up with data consumers to determine the most useful representation of this data.

0 commit comments

Comments
 (0)