Skip to content

Commit 2ffaf19

Browse files
author
ci bot
committed
Merge branch 'feat/TG-999-pii-masking' into 'enterprise'
feat: pii masking, xde, hash fingerprints See merge request dkinternal/testgen/dataops-testgen!439
2 parents 44aaa3f + 0146128 commit 2ffaf19

86 files changed

Lines changed: 1947 additions & 517 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

testgen/commands/queries/execute_tests_query.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -326,16 +326,21 @@ def _get_params(self, test_def: TestExecutionDef | None = None) -> dict:
326326
# Freshness exclusion params — computed per test at execution time
327327
if test_def.test_type == "Freshness_Trend" and test_def.baseline_sum:
328328
sched = get_schedule_params(test_def.prediction)
329-
has_exclusions = self._exclude_weekends or sched.excluded_days or sched.window_start is not None
329+
# Once the schedule is active (excluded_days derived from active_days),
330+
# it supersedes exclude_weekends as the single source of truth for
331+
# day exclusion — avoids conflicts where a detection day (e.g. Saturday)
332+
# is active per schedule but excluded per exclude_weekends.
333+
effective_exclude_weekends = False if sched.excluded_days else self._exclude_weekends
334+
has_exclusions = effective_exclude_weekends or sched.excluded_days or sched.window_start is not None
330335
if has_exclusions:
331336
last_update = pd.Timestamp(test_def.baseline_sum)
332-
excluded = int(count_excluded_minutes(
333-
last_update, self.run_date, self._exclude_weekends, self._holiday_dates,
337+
excluded = round(count_excluded_minutes(
338+
last_update, self.run_date, effective_exclude_weekends, self._holiday_dates,
334339
tz=self._schedule_tz, excluded_days=sched.excluded_days,
335340
window_start=sched.window_start, window_end=sched.window_end,
336341
))
337342
is_excl = 1 if is_excluded_day(
338-
pd.Timestamp(self.run_date), self._exclude_weekends, self._holiday_dates,
343+
pd.Timestamp(self.run_date), effective_exclude_weekends, self._holiday_dates,
339344
tz=self._schedule_tz, excluded_days=sched.excluded_days,
340345
window_start=sched.window_start, window_end=sched.window_end,
341346
) else 0

testgen/commands/queries/profiling_query.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,8 +167,10 @@ def update_profiling_results(self) -> list[tuple[str, dict]]:
167167
self._get_query("functional_datatype.sql"),
168168
self._get_query("functional_tabletype_stage.sql"),
169169
self._get_query("functional_tabletype_update.sql"),
170-
self._get_query("pii_flag.sql"),
171170
]
171+
if self.table_group.profile_flag_pii:
172+
queries.append(self._get_query("pii_flag.sql"))
173+
queries.append(self._get_query("pii_flag_update.sql"))
172174
if self.table_group.profile_flag_cdes:
173175
queries.append(self._get_query("cde_flagger_query.sql"))
174176
return queries

testgen/commands/run_profiling.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from testgen.common.mixpanel_service import MixpanelService
2929
from testgen.common.models import get_current_session, with_database_session
3030
from testgen.common.models.connection import Connection
31+
from testgen.common.models.data_column import DataColumnChars
3132
from testgen.common.models.profiling_run import ProfilingRun
3233
from testgen.common.models.table_group import TableGroup
3334
from testgen.common.models.test_suite import TestSuite
@@ -85,6 +86,8 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d
8586
LOG.info(f"Profiling run: {profiling_run.id}, Table group: {table_group.table_groups_name}, Connection: {connection.connection_name}")
8687
try:
8788
data_chars = run_data_chars_refresh(connection, table_group, profiling_run.profiling_starttime)
89+
if table_group.profile_exclude_xde:
90+
data_chars = _exclude_xde_columns(data_chars, table_group.id)
8891
distinct_tables = {(column.table_name, column.record_ct) for column in data_chars}
8992

9093
profiling_run.set_progress("data_chars", "Completed")
@@ -144,6 +147,22 @@ def run_profiling(table_group_id: str | UUID, username: str | None = None, run_d
144147
"""
145148

146149

150+
def _exclude_xde_columns(data_chars: list[ColumnChars], table_group_id: UUID) -> list[ColumnChars]:
    """Drop columns flagged as excluded_data_element (XDE) in data_column_chars.

    Returns a new list containing only the non-excluded columns; the input
    list is left untouched. When no columns are flagged for the table group,
    the original list is returned as-is.
    """
    flagged = DataColumnChars.select_where(
        DataColumnChars.table_groups_id == table_group_id,
        DataColumnChars.excluded_data_element.is_(True),
    )
    if not flagged:
        return data_chars

    # Match on (table, column) pairs — column names alone are not unique
    # across tables within a table group.
    flagged_keys = {(row.table_name, row.column_name) for row in flagged}
    kept = [
        col for col in data_chars
        if (col.table_name, col.column_name) not in flagged_keys
    ]
    dropped_ct = len(data_chars) - len(kept)
    if dropped_ct:
        LOG.info(f"Excluding {dropped_ct} XDE columns from profiling")
    return kept
164+
165+
147166
def _run_column_profiling(sql_generator: ProfilingSQL, data_chars: list[ColumnChars]) -> None:
148167
profiling_run = sql_generator.profiling_run
149168
profiling_run.set_progress("col_profiling", "Running")

testgen/commands/test_thresholds_prediction.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,11 @@ def compute_freshness_threshold(
213213
if schedule.stage == "active":
214214
excluded_days = frozenset(range(7)) - schedule.active_days if schedule.active_days else None
215215

216+
# Once the schedule is active, excluded_days is the single source of truth
217+
# for day exclusion — it supersedes exclude_weekends, which was the user's
218+
# initial hint before enough data was available for schedule inference.
219+
schedule_exclude_weekends = False if excluded_days else exclude_weekends
220+
216221
# For sub-daily schedules, apply window exclusion for overnight gaps
217222
has_window = (
218223
schedule.frequency == "sub_daily"
@@ -228,7 +233,7 @@ def compute_freshness_threshold(
228233
upper_percentile=upper_percentile,
229234
floor_multiplier=floor_multiplier,
230235
lower_percentile=lower_percentile,
231-
exclude_weekends=exclude_weekends,
236+
exclude_weekends=schedule_exclude_weekends,
232237
holiday_codes=holiday_codes,
233238
tz=schedule_tz,
234239
staleness_factor=staleness_factor,
@@ -246,7 +251,7 @@ def compute_freshness_threshold(
246251
holiday_dates = resolve_holiday_dates(holiday_codes, history.index) if holiday_codes else None
247252
schedule_upper = minutes_to_next_deadline(
248253
result.last_update, schedule,
249-
exclude_weekends, holiday_dates, schedule_tz,
254+
schedule_exclude_weekends, holiday_dates, schedule_tz,
250255
deadline_buffer, excluded_days=excluded_days,
251256
)
252257
if schedule_upper is not None:
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from uuid import UUID, uuid4
2+
3+
from sqlalchemy import Boolean, Column, ForeignKey, String
4+
from sqlalchemy.dialects import postgresql
5+
6+
from testgen.common.models.entity import Entity
7+
8+
9+
class DataColumnChars(Entity):
    """Partial ORM mapping of the data_column_chars table.

    Only the columns the Python code reads are mapped; the remaining
    physical columns are listed in the comment at the bottom of the class.
    """

    __tablename__ = "data_column_chars"

    # Primary key; the physical column is named "column_id".
    id: UUID = Column("column_id", postgresql.UUID(as_uuid=True), primary_key=True, default=uuid4)
    table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("table_groups.id"))
    schema_name: str = Column(String)
    table_name: str = Column(String)
    column_name: str = Column(String)
    # True when the column is marked as an excluded data element (XDE).
    excluded_data_element: bool | None = Column(Boolean, nullable=True)
    # Non-null when the column has been flagged as containing PII.
    pii_flag: str | None = Column(String(50), nullable=True)

    _default_order_by = (id,)

    # Unmapped columns: table_id, ordinal_position, general_type, column_type,
    # db_data_type, functional_data_type, description, critical_data_element,
    # data_source, source_system, source_process, business_domain,
    # stakeholder_group, transform_level, aggregation_level, data_product,
    # add_date, last_mod_date, drop_date, test_ct, last_test_date,
    # tests_last_run, tests_7_days_prior, tests_30_days_prior,
    # fails_last_run, fails_7_days_prior, fails_30_days_prior,
    # warnings_last_run, warnings_7_days_prior, warnings_30_days_prior,
    # last_complete_profile_run_id, valid_profile_issue_ct,
    # valid_test_issue_ct, dq_score_profiling, dq_score_testing

testgen/common/models/table_group.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ class TableGroupMinimal(EntityMinimal):
2828
profile_use_sampling: bool
2929
profiling_delay_days: str
3030
monitor_test_suite_id: UUID | None
31+
profile_flag_cdes: bool
32+
profile_flag_pii: bool
33+
profile_exclude_xde: bool
3134
last_complete_profile_run_id: UUID | None
3235

3336

@@ -112,6 +115,8 @@ class TableGroup(Entity):
112115
profile_sample_min_count: int = Column(BigInteger, default=100000)
113116
profiling_delay_days: str = Column(String, default="0")
114117
profile_flag_cdes: bool = Column(Boolean, default=True)
118+
profile_flag_pii: bool = Column(Boolean, default=True)
119+
profile_exclude_xde: bool = Column(Boolean, default=True)
115120
profile_do_pair_rules: bool = Column(YNString, default="N")
116121
profile_pair_rule_pct: int = Column(Integer, default=95)
117122
include_in_dashboard: bool = Column(Boolean, default=True)

testgen/common/pii_masking.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""PII masking utilities for redacting sensitive data in the UI."""
2+
import pandas as pd
3+
4+
from testgen.ui.services.database_service import fetch_all_from_db
5+
6+
# Placeholder rendered wherever a PII value has been removed.
PII_REDACTED = "[PII Redacted]"

# Profiling result fields that can surface raw data values and therefore
# must be redacted for PII-flagged columns.
PROFILING_PII_FIELDS = (
    "top_freq_values", "min_text", "max_text",
    "min_value", "min_value_over_0", "max_value",
    "min_date", "max_date",
)
13+
14+
15+
def get_pii_columns(table_group_id: str, schema: str | None = None, table_name: str | None = None) -> set[str]:
    """Look up PII-flagged column names from data_column_chars.

    Optionally narrows the lookup to a single schema and/or table; the
    corresponding filter clauses are included only when a value is given.
    """

    query = f"""
        SELECT column_name
        FROM data_column_chars
        WHERE table_groups_id = :table_group_id
        AND pii_flag IS NOT NULL
        {"AND schema_name = :schema" if schema else ""}
        {"AND table_name = :table_name" if table_name else ""}
    """
    # Unused bind names are simply ignored when their clause is omitted.
    bind_params: dict = {
        "table_group_id": table_group_id,
        "schema": schema,
        "table_name": table_name,
    }

    return {record.column_name for record in fetch_all_from_db(query, bind_params)}
34+
35+
36+
def mask_source_data_pii(df: pd.DataFrame, pii_columns: set[str]) -> None:
    """In-place mask values in PII columns with PII_REDACTED.

    Args:
        df: Source-data frame; all values in any matching column are replaced.
        pii_columns: Column names flagged as PII; matched case-insensitively
            against df's column labels.
    """
    if df.empty or not pii_columns:
        return
    # Build the lowercase lookup set once — the original nested loop rescanned
    # pii_columns for every frame column (O(cols * pii)); this also matches
    # the pii_lower idiom used by the other masking helpers in this module.
    pii_lower = {c.lower() for c in pii_columns}
    for df_col in df.columns:
        # Non-string labels (e.g. integer positions) can never match a flagged
        # column name and would crash on .lower().
        if isinstance(df_col, str) and df_col.lower() in pii_lower:
            df[df_col] = PII_REDACTED
45+
46+
47+
def mask_hygiene_detail(data: pd.DataFrame | list[dict], pii_columns: set[str] | None = None) -> None:
    """Redact hygiene issue detail for PII columns where detail_redactable is true.

    Accepts:
    - DataFrame with detail_redactable, pii_flag, and detail columns (hygiene issues grid/export)
    - List of issue dicts, each with detail_redactable and either pii_flag or column_name
      (when pii_columns set is provided, matches column_name against it)

    Mutates *data* in place.
    """
    if isinstance(data, pd.DataFrame):
        # Both marker columns are required to decide: detail_redactable says the
        # anomaly detail exposes data values, pii_flag says the column holds PII.
        # Previously a frame without pii_flag raised KeyError.
        if data.empty or "detail_redactable" not in data.columns or "pii_flag" not in data.columns:
            return
        pii_mask = data["detail_redactable"].fillna(False) & data["pii_flag"].notna()
        data.loc[pii_mask, "detail"] = PII_REDACTED
        return

    if not data:
        return
    pii_lower = {c.lower() for c in pii_columns} if pii_columns else None
    for issue in data:
        if not issue.get("detail_redactable"):
            continue
        if pii_lower is not None:
            # `or ""` guards an explicit None value — dict.get's default only
            # applies when the key is absent, so .lower() could otherwise crash.
            if (issue.get("column_name") or "").lower() in pii_lower:
                issue["detail"] = PII_REDACTED
        elif issue.get("pii_flag"):
            issue["detail"] = PII_REDACTED
73+
74+
75+
def mask_profiling_pii(data: pd.DataFrame | dict, pii_columns: set[str]) -> None:
    """Mask profiling fields for PII columns. Accepts a DataFrame or a single-row dict.

    Mutates *data* in place; column_name matching is case-insensitive.
    """
    if isinstance(data, dict):
        if not pii_columns:
            return
        pii_lower = {c.lower() for c in pii_columns}
        column_name = data.get("column_name")
        # A row with a known, non-PII column is left untouched; a row with no
        # column_name is masked anyway (fail closed on PII).
        if column_name and column_name.lower() not in pii_lower:
            return
        for field in PROFILING_PII_FIELDS:
            if field in data:
                data[field] = PII_REDACTED
        return

    # Previously a frame without a column_name column raised KeyError.
    if data.empty or not pii_columns or "column_name" not in data.columns:
        return
    pii_lower = {c.lower() for c in pii_columns}
    mask = data["column_name"].str.lower().isin(pii_lower)
    for field in PROFILING_PII_FIELDS:
        if field in data.columns:
            # Numeric/date columns cannot hold the redaction string; widen the
            # dtype to object before assigning.
            if data[field].dtype != object:
                data[field] = data[field].astype(object)
            data.loc[mask, field] = PII_REDACTED

testgen/template/dbsetup/030_initialize_new_schema_structure.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ CREATE TABLE table_groups
113113
profile_sample_min_count BIGINT DEFAULT 100000,
114114
profiling_delay_days VARCHAR(3) DEFAULT '0',
115115
profile_flag_cdes BOOLEAN DEFAULT TRUE,
116+
profile_flag_pii BOOLEAN DEFAULT TRUE,
117+
profile_exclude_xde BOOLEAN DEFAULT TRUE,
116118
profile_do_pair_rules VARCHAR(3) DEFAULT 'N',
117119
profile_pair_rule_pct INTEGER DEFAULT 95,
118120
include_in_dashboard BOOLEAN DEFAULT TRUE,
@@ -341,6 +343,7 @@ CREATE TABLE profile_anomaly_types (
341343
anomaly_description VARCHAR(500),
342344
anomaly_criteria VARCHAR(2000),
343345
detail_expression VARCHAR(2000),
346+
detail_redactable BOOLEAN DEFAULT FALSE,
344347
issue_likelihood VARCHAR(50), -- Potential, Likely, Certain
345348
suggested_action VARCHAR(1000),
346349
dq_score_prevalence_formula TEXT,
@@ -447,6 +450,8 @@ CREATE TABLE data_column_chars (
447450
functional_data_type VARCHAR(50),
448451
description VARCHAR(1000),
449452
critical_data_element BOOLEAN,
453+
excluded_data_element BOOLEAN,
454+
pii_flag VARCHAR(50),
450455
data_source VARCHAR(40),
451456
source_system VARCHAR(40),
452457
source_process VARCHAR(40),
@@ -609,6 +614,7 @@ CREATE TABLE target_data_lookups (
609614
sql_flavor VARCHAR(20) NOT NULL,
610615
lookup_type VARCHAR(10),
611616
lookup_query VARCHAR,
617+
lookup_redactable_columns VARCHAR(100),
612618
error_type VARCHAR(30) NOT NULL,
613619
CONSTRAINT target_data_lookups_test_id_sql_flavor_error_type_pk
614620
PRIMARY KEY (test_id, sql_flavor, error_type)

testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Boolean_Value_Mismatch.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ profile_anomaly_types:
1616
detail_expression: |-
1717
CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text
1818
ELSE 'Top Freq: ' || p.top_freq_values END
19+
detail_redactable: true
1920
issue_likelihood: Likely
2021
suggested_action: "Review your source data and follow-up with data owners to determine\
2122
\ whether this data needs to be corrected. "

testgen/template/dbsetup_anomaly_types/profile_anomaly_types_Delimited_Data_Embedded.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ profile_anomaly_types:
99
p.std_pattern_match = 'DELIMITED_DATA'
1010
detail_expression: |-
1111
CASE WHEN p.top_freq_values IS NULL THEN 'Min: ' || p.min_text || ', Max: ' || p.max_text ELSE 'Top Freq: ' || p.top_freq_values END
12+
detail_redactable: true
1213
issue_likelihood: Likely
1314
suggested_action: |-
1415
Review your source data and follow-up with data consumers to determine the most useful representation of this data.

0 commit comments

Comments
 (0)