Skip to content

Commit 9a0daf0

Browse files
author
testgen-ci-bot
committed
Merge remote-tracking branch 'origin/enterprise' into fix/scheduler-shutdown-race
2 parents 4cdb3f2 + 2ffaf19 commit 9a0daf0

96 files changed

Lines changed: 2123 additions & 581 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

testgen/commands/queries/execute_tests_query.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -326,16 +326,21 @@ def _get_params(self, test_def: TestExecutionDef | None = None) -> dict:
326326
# Freshness exclusion params — computed per test at execution time
327327
if test_def.test_type == "Freshness_Trend" and test_def.baseline_sum:
328328
sched = get_schedule_params(test_def.prediction)
329-
has_exclusions = self._exclude_weekends or sched.excluded_days or sched.window_start is not None
329+
# Once the schedule is active (excluded_days derived from active_days),
330+
# it supersedes exclude_weekends as the single source of truth for
331+
# day exclusion — avoids conflicts where a detection day (e.g. Saturday)
332+
# is active per schedule but excluded per exclude_weekends.
333+
effective_exclude_weekends = False if sched.excluded_days else self._exclude_weekends
334+
has_exclusions = effective_exclude_weekends or sched.excluded_days or sched.window_start is not None
330335
if has_exclusions:
331336
last_update = pd.Timestamp(test_def.baseline_sum)
332-
excluded = int(count_excluded_minutes(
333-
last_update, self.run_date, self._exclude_weekends, self._holiday_dates,
337+
excluded = round(count_excluded_minutes(
338+
last_update, self.run_date, effective_exclude_weekends, self._holiday_dates,
334339
tz=self._schedule_tz, excluded_days=sched.excluded_days,
335340
window_start=sched.window_start, window_end=sched.window_end,
336341
))
337342
is_excl = 1 if is_excluded_day(
338-
pd.Timestamp(self.run_date), self._exclude_weekends, self._holiday_dates,
343+
pd.Timestamp(self.run_date), effective_exclude_weekends, self._holiday_dates,
339344
tz=self._schedule_tz, excluded_days=sched.excluded_days,
340345
window_start=sched.window_start, window_end=sched.window_end,
341346
) else 0

testgen/commands/queries/profiling_query.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ def update_profiling_results(self) -> list[tuple[str, dict]]:
167167
self._get_query("functional_datatype.sql"),
168168
self._get_query("functional_tabletype_stage.sql"),
169169
self._get_query("functional_tabletype_update.sql"),
170-
self._get_query("pii_flag.sql"),
171170
]
171+
if self.table_group.profile_flag_pii:
172+
queries.append(self._get_query("pii_flag.sql"))
173+
queries.append(self._get_query("pii_flag_update.sql"))
172174
if self.table_group.profile_flag_cdes:
173175
queries.append(self._get_query("cde_flagger_query.sql"))
174176
return queries

testgen/commands/run_profiling.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from testgen.common.mixpanel_service import MixpanelService
2929
from testgen.common.models import get_current_session, with_database_session
3030
from testgen.common.models.connection import Connection
31+
from testgen.common.models.data_column import DataColumnChars
3132
from testgen.common.models.profiling_run import ProfilingRun
3233
from testgen.common.models.table_group import TableGroup
3334
from testgen.common.models.test_suite import TestSuite
@@ -85,6 +86,8 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d
8586
LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}")
8687
try:
8788
data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime)
89+
if table_group.profile_exclude_xde:
90+
data_chars = _exclude_xde_columns(data_chars, table_group.id)
8891
distinct_tables = {(column.table_name, column.record_ct) for column in data_chars}
8992

9093
profiling_run.set_progress("data_chars", "Completed")
@@ -144,6 +147,22 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d
144147
"""
145148

146149

150+
def _exclude_xde_columns(data_chars: list[ColumnChars], table_group_id: UUID) -> list[ColumnChars]:
151+
"""Filter out columns marked as excluded_data_element in data_column_chars."""
152+
xde_columns = DataColumnChars.select_where(
153+
DataColumnChars.table_groups_id == table_group_id,
154+
DataColumnChars.excluded_data_element.is_(True),
155+
)
156+
if not xde_columns:
157+
return data_chars
158+
159+
excluded = {(col.table_name, col.column_name) for col in xde_columns}
160+
filtered = [col for col in data_chars if (col.table_name, col.column_name) not in excluded]
161+
if len(filtered) < len(data_chars):
162+
LOG.info(f"Excluding {len(data_chars) - len(filtered)} XDE columns from profiling")
163+
return filtered
164+
165+
147166
def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnChars]) -> None:
148167
profiling_run = sql_generator.profiling_run
149168
profiling_run.set_progress("col_profiling", "Running")

testgen/commands/test_thresholds_prediction.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,11 @@ def compute_freshness_threshold(
213213
if schedule.stage == "active":
214214
excluded_days = frozenset(range(7)) - schedule.active_days if schedule.active_days else None
215215

216+
# Once the schedule is active, excluded_days is the single source of truth
217+
# for day exclusion — it supersedes exclude_weekends, which was the user's
218+
# initial hint before enough data was available for schedule inference.
219+
schedule_exclude_weekends = False if excluded_days else exclude_weekends
220+
216221
# For sub-daily schedules, apply window exclusion for overnight gaps
217222
has_window = (
218223
schedule.frequency == "sub_daily"
@@ -228,7 +233,7 @@ def compute_freshness_threshold(
228233
upper_percentile=upper_percentile,
229234
floor_multiplier=floor_multiplier,
230235
lower_percentile=lower_percentile,
231-
exclude_weekends=exclude_weekends,
236+
exclude_weekends=schedule_exclude_weekends,
232237
holiday_codes=holiday_codes,
233238
tz=schedule_tz,
234239
staleness_factor=staleness_factor,
@@ -246,7 +251,7 @@ def compute_freshness_threshold(
246251
holiday_dates = resolve_holiday_dates(holiday_codes, history.index) if holiday_codes else None
247252
schedule_upper = minutes_to_next_deadline(
248253
result.last_update, schedule,
249-
exclude_weekends, holiday_dates, schedule_tz,
254+
schedule_exclude_weekends, holiday_dates, schedule_tz,
250255
deadline_buffer, excluded_days=excluded_days,
251256
)
252257
if schedule_upper is not None:
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from uuid import UUID, uuid4
2+
3+
from sqlalchemy import Boolean, Column, ForeignKey, String
4+
from sqlalchemy.dialects import postgresql
5+
6+
from testgen.common.models.entity import Entity
7+
8+
9+
class DataColumnChars(Entity):
10+
__tablename__ = "data_column_chars"
11+
12+
id: UUID = Column("column_id", postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4)
13+
table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("table_groups.id"))
14+
schema_name: str = Column(String)
15+
table_name: str = Column(String)
16+
column_name: str = Column(String)
17+
excluded_data_element: bool | None = Column(Boolean, nullable=True)
18+
pii_flag: str | None = Column(String(50), nullable=True)
19+
20+
_default_order_by = (id,)
21+
22+
# Unmapped columns: table_id, ordinal_position, general_type, column_type,
23+
# db_data_type, functional_data_type, description, critical_data_element,
24+
# data_source, source_system, source_process, business_domain,
25+
# stakeholder_group, transform_level, aggregation_level, data_product,
26+
# add_date, last_mod_date, drop_date, test_ct, last_test_date,
27+
# tests_last_run, tests_7_days_prior, tests_30_days_prior,
28+
# fails_last_run, fails_7_days_prior, fails_30_days_prior,
29+
# warnings_last_run, warnings_7_days_prior, warnings_30_days_prior,
30+
# last_complete_profile_run_id, valid_profile_issue_ct,
31+
# valid_test_issue_ct, dq_score_profiling, dq_score_testing

testgen/common/models/table_group.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ class TableGroupMinimal(EntityMinimal):
2828
profile_use_sampling: bool
2929
profiling_delay_days: str
3030
monitor_test_suite_id: UUID | None
31+
profile_flag_cdes: bool
32+
profile_flag_pii: bool
33+
profile_exclude_xde: bool
3134
last_complete_profile_run_id: UUID | None
3235

3336

@@ -112,6 +115,8 @@ class TableGroup(Entity):
112115
profile_sample_min_count: int = Column(BigInteger, default=100000)
113116
profiling_delay_days: str = Column(String, default="0")
114117
profile_flag_cdes: bool = Column(Boolean, default=True)
118+
profile_flag_pii: bool = Column(Boolean, default=True)
119+
profile_exclude_xde: bool = Column(Boolean, default=True)
115120
profile_do_pair_rules: bool = Column(YNString, default="N")
116121
profile_pair_rule_pct: int = Column(Integer, default=95)
117122
include_in_dashboard: bool = Column(Boolean, default=True)

testgen/common/pii_masking.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""PII masking utilities for redacting sensitive data in the UI."""
2+
import pandas as pd
3+
4+
from testgen.ui.services.database_service import fetch_all_from_db
5+
6+
PII_REDACTED = "[PII Redacted]"
7+
8+
PROFILING_PII_FIELDS = (
9+
"top_freq_values", "min_text", "max_text",
10+
"min_value", "min_value_over_0", "max_value",
11+
"min_date", "max_date",
12+
)
13+
14+
15+
def get_pii_columns(table_group_id: str, schema: str | None = None, table_name: str | None = None) -> set[str]:
16+
"""Look up PII-flagged column names from data_column_chars."""
17+
18+
query = f"""
19+
SELECT column_name
20+
FROM data_column_chars
21+
WHERE table_groups_id = :table_group_id
22+
AND pii_flag IS NOT NULL
23+
{"AND schema_name = :schema" if schema else ""}
24+
{"AND table_name = :table_name" if table_name else ""}
25+
"""
26+
params: dict = {
27+
"table_group_id": table_group_id,
28+
"schema": schema,
29+
"table_name": table_name,
30+
}
31+
32+
results = fetch_all_from_db(query, params)
33+
return {row.column_name for row in results}
34+
35+
36+
def mask_source_data_pii(df: pd.DataFrame, pii_columns: set[str]) -> None:
37+
"""In-place mask values in PII columns with PII_REDACTED."""
38+
if df.empty or not pii_columns:
39+
return
40+
for col in pii_columns:
41+
# Match case-insensitively since column names may differ in case
42+
for df_col in df.columns:
43+
if df_col.lower() == col.lower():
44+
df[df_col] = PII_REDACTED
45+
46+
47+
def mask_hygiene_detail(data: pd.DataFrame | list[dict], pii_columns: set[str] | None = None) -> None:
48+
"""Redact hygiene issue detail for PII columns where detail_redactable is true.
49+
50+
Accepts:
51+
- DataFrame with detail_redactable, pii_flag, and detail columns (hygiene issues grid/export)
52+
- List of issue dicts, each with detail_redactable and either pii_flag or column_name
53+
(when pii_columns set is provided, matches column_name against it)
54+
"""
55+
if isinstance(data, pd.DataFrame):
56+
if data.empty or "detail_redactable" not in data.columns:
57+
return
58+
pii_mask = data["detail_redactable"].fillna(False) & data["pii_flag"].notna()
59+
data.loc[pii_mask, "detail"] = PII_REDACTED
60+
return
61+
62+
if not data:
63+
return
64+
pii_lower = {c.lower() for c in pii_columns} if pii_columns else None
65+
for issue in data:
66+
if not issue.get("detail_redactable"):
67+
continue
68+
if pii_lower is not None:
69+
if issue.get("column_name", "").lower() in pii_lower:
70+
issue["detail"] = PII_REDACTED
71+
elif issue.get("pii_flag"):
72+
issue["detail"] = PII_REDACTED
73+
74+
75+
def mask_profiling_pii(data: pd.DataFrame | dict, pii_columns: set[str]) -> None:
76+
"""Mask profiling fields for PII columns. Accepts a DataFrame or a single-row dict."""
77+
if isinstance(data, dict):
78+
if not pii_columns:
79+
return
80+
column_name = data.get("column_name")
81+
if column_name and column_name.lower() not in {c.lower() for c in pii_columns}:
82+
return
83+
for field in PROFILING_PII_FIELDS:
84+
if field in data:
85+
data[field] = PII_REDACTED
86+
return
87+
88+
if data.empty or not pii_columns:
89+
return
90+
pii_lower = {c.lower() for c in pii_columns}
91+
mask = data["column_name"].str.lower().isin(pii_lower)
92+
for field in PROFILING_PII_FIELDS:
93+
if field in data.columns:
94+
if data[field].dtype != object:
95+
data[field] = data[field].astype(object)
96+
data.loc[mask, field] = PII_REDACTED

testgen/mcp/exceptions.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""MCP exception hierarchy and error boundary.
2+
3+
``MCPUserError`` (and its subclasses) carry safe, user-facing messages.
4+
``mcp_error_boundary`` is a decorator that catches them and converts to
5+
text, while neutralising unexpected exceptions.
6+
"""
7+
8+
import functools
9+
import logging
10+
11+
LOG = logging.getLogger("testgen")
12+
13+
14+
class MCPUserError(Exception):
15+
"""Safe, user-facing error for MCP tools, prompts, and resources.
16+
17+
The error boundary converts ``str(e)`` into the response text.
18+
All other exceptions are treated as unexpected: their traceback is
19+
logged and a neutral message is returned to the client.
20+
"""
21+
22+
23+
class MCPPermissionDenied(MCPUserError):
24+
"""Raised when access is denied due to insufficient project permissions."""
25+
26+
27+
def mcp_error_handler(fn):
28+
"""Wrap an MCP handler (tool, resource, or prompt) with safe error handling.
29+
30+
- ``MCPUserError`` (including ``MCPPermissionDenied``) → ``str(e)`` as the response.
31+
- Any other exception → traceback logged, neutral message returned.
32+
"""
33+
34+
@functools.wraps(fn)
35+
def wrapper(*args, **kwargs):
36+
try:
37+
return fn(*args, **kwargs)
38+
except MCPUserError as e:
39+
return str(e)
40+
except Exception:
41+
LOG.exception("Unhandled error in MCP handler '%s'", fn.__name__)
42+
return "An unexpected error occurred."
43+
44+
return wrapper

testgen/mcp/permissions.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from testgen.common.models.project_membership import ProjectMembership
99
from testgen.common.models.user import User
10+
from testgen.mcp.exceptions import MCPPermissionDenied
1011
from testgen.utils.plugins import PluginHook
1112

1213
_NOT_SET = object()
@@ -17,10 +18,6 @@
1718
)
1819

1920

20-
class MCPPermissionDenied(Exception):
21-
"""Raised by ProjectPermissions when access is denied. Caught by the decorator."""
22-
23-
2421
@dataclass(frozen=True, slots=True)
2522
class ProjectPermissions:
2623
memberships: dict[str, str] # {project_code: role}
@@ -105,9 +102,9 @@ def mcp_permission(permission: str) -> Callable:
105102
permission, and stores it in a ContextVar. The tool retrieves the value
106103
via ``get_project_permissions()``.
107104
108-
If the user has no projects with the required permission, returns an
109-
early denial message. Catches MCPPermissionDenied raised by tool code
110-
and returns str(e) as the tool response.
105+
Raises ``MCPPermissionDenied`` if the user has no projects with the required
106+
permission. Other ``MCPPermissionDenied`` exceptions from tool code propagate
107+
through — the ``safe_tool`` error boundary handles conversion to text.
111108
"""
112109

113110
def decorator(fn: Callable) -> Callable:
@@ -116,12 +113,12 @@ def wrapper(*args, **kwargs):
116113
user = get_current_mcp_user()
117114
perms = _compute_project_permissions(user, permission)
118115
if not perms.allowed_codes:
119-
return "Your role does not include the necessary permission for this operation on any project."
116+
raise MCPPermissionDenied(
117+
"Your role does not include the necessary permission for this operation on any project."
118+
)
120119
tok = _mcp_project_permissions.set(perms)
121120
try:
122121
return fn(*args, **kwargs)
123-
except MCPPermissionDenied as e:
124-
return str(e)
125122
finally:
126123
_mcp_project_permissions.reset(tok)
127124

0 commit comments

Comments
 (0)