|
| 1 | +import re |
| 2 | +from collections.abc import Iterable |
| 3 | +from dataclasses import dataclass |
| 4 | +from typing import Self |
| 5 | +from uuid import UUID, uuid4 |
| 6 | + |
| 7 | +from sqlalchemy import Column, ForeignKey, String, and_, case, null, select |
| 8 | +from sqlalchemy.dialects import postgresql |
| 9 | +from sqlalchemy.ext.hybrid import hybrid_property |
| 10 | +from sqlalchemy.orm import aliased, relationship |
| 11 | +from sqlalchemy.sql.functions import func |
| 12 | + |
| 13 | +from testgen.common.models import Base, get_current_session |
| 14 | +from testgen.common.models.entity import Entity |
| 15 | + |
# Captures the risk level (MODERATE or HIGH) from text of the form
# "Risk: <LEVEL>,". HygieneIssue.priority applies it to the issue detail, both
# in Python (via .search) and in SQL (via .pattern).
PII_RISK_RE = re.compile(
    r"Risk: (MODERATE|HIGH),",
)
| 17 | + |
| 18 | + |
@dataclass
class IssueCount:
    """Issue tally: the overall count plus how many of those are inactive."""

    total: int = 0
    inactive: int = 0

    @property
    def active(self) -> int:
        """Issues that are not inactive, i.e. ``total`` minus ``inactive``."""
        remaining = self.total - self.inactive
        return remaining
| 27 | + |
| 28 | + |
class HygieneIssueType(Base):
    """Partial ORM mapping of the ``profile_anomaly_types`` table."""

    __tablename__ = "profile_anomaly_types"

    # Note: not all table columns are implemented by this entity

    id: str = Column(String, primary_key=True)
    # The attributes below are exposed under names that differ from the
    # underlying database column names.
    likelihood: str = Column(String, name="issue_likelihood")
    name: str = Column(String, name="anomaly_name")
| 37 | + |
| 38 | + |
class HygieneIssue(Entity):
    """Partial ORM mapping of the ``profile_anomaly_results`` table.

    Each row links an issue type (``anomaly_id``) to a schema/table/column
    within a profiling run. Only a subset of the table's columns is mapped.
    """

    __tablename__ = "profile_anomaly_results"

    id: UUID = Column(postgresql.UUID(as_uuid=True), primary_key=True, nullable=False, default=uuid4)

    project_code: str = Column(String, ForeignKey("projects.project_code"))
    table_groups_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("table_groups.id"), nullable=False)
    profile_run_id: UUID = Column(postgresql.UUID(as_uuid=True), ForeignKey("profiling_runs.id"), nullable=False)

    # Exposed as ``type_id`` but stored in the ``anomaly_id`` column.
    type_id: str = Column("anomaly_id", String, ForeignKey("profile_anomaly_types.id"), nullable=False)
    type_ = relationship(HygieneIssueType)

    schema_name: str = Column(String, nullable=False)
    table_name: str = Column(String, nullable=False)
    column_name: str = Column(String, nullable=False)

    detail: str = Column(String, nullable=False)
    disposition: str = Column(String)

    # Note: not all table columns are implemented by this entity

    @hybrid_property
    def priority(self):
        """Priority label of this issue, evaluated on a loaded instance.

        When the type's likelihood is anything other than "Potential PII", that
        likelihood is the priority. For "Potential PII" the priority is the
        risk level captured from ``detail`` by ``PII_RISK_RE``, capitalized
        ("Moderate" or "High"), or ``None`` when ``detail`` is empty or does
        not match.
        """
        if self.type_.likelihood != "Potential PII":
            return self.type_.likelihood
        elif self.detail and (match := PII_RISK_RE.search(self.detail)):
            return match.group(1).capitalize()
        else:
            return None

    @priority.expression
    def priority(cls):
        """SQL counterpart of ``priority`` for use inside queries.

        The expression references ``HygieneIssueType.likelihood`` directly, so
        the enclosing query must join ``HygieneIssueType`` (both query builders
        in this class do).
        """
        return case(
            (
                HygieneIssueType.likelihood != "Potential PII",
                HygieneIssueType.likelihood,
            ),
            # The same regex as the Python side is handed to the database.
            # NOTE(review): relies on PostgreSQL's two-argument substring()
            # returning the first parenthesized group of a POSIX regex match.
            else_=func.initcap(
                func.substring(cls.detail, PII_RISK_RE.pattern)
            ),
        )

    @classmethod
    def select_count_by_priority(cls, profiling_run_id: UUID) -> dict[str, IssueCount]:
        """Count the issues of a profiling run, grouped by priority.

        Args:
            profiling_run_id: Profiling run whose issues are counted.

        Returns:
            A mapping from priority to ``IssueCount``. ``inactive`` counts the
            issues whose disposition is "Dismissed" or "Inactive". The
            priorities "Definite", "Likely", "Possible", "High" and "Moderate"
            are always present, with zero counts when the run has no such
            issues.
        """
        inactive_dispositions = ("Dismissed", "Inactive")
        count_query = (
            select(
                cls.priority,
                func.count(),
                # COUNT(expr) counts every non-NULL value, so counting the bare
                # IN (...) boolean would also count rows where it is FALSE
                # (any other non-null disposition). A CASE without ELSE yields
                # NULL for non-matching rows, which COUNT skips.
                func.count(case((cls.disposition.in_(inactive_dispositions), 1))),
            )
            .select_from(cls)
            .join(HygieneIssueType)
            .where(cls.profile_run_id == profiling_run_id)
            .group_by(cls.priority)
        )
        result = {
            priority: IssueCount(total, inactive)
            for priority, total, inactive in get_current_session().execute(count_query)
        }
        # Make sure callers can look up every known priority.
        for p in ("Definite", "Likely", "Possible", "High", "Moderate"):
            result.setdefault(p, IssueCount())
        return result

    @classmethod
    def select_with_diff(
        cls, profiling_run_id: UUID, other_profiling_run_id: UUID | None, *where_clauses, limit: int | None = None
    ) -> Iterable[tuple[Self, bool | None]]:
        """Select the issues of a run, flagging those absent from another run.

        Args:
            profiling_run_id: Profiling run whose issues are returned.
            other_profiling_run_id: Run to compare against. When falsy, the
                ``is_new`` flag is NULL for every row.
            *where_clauses: Extra criteria added to the WHERE clause.
            limit: Maximum number of rows, passed to ``Select.limit``.

        Returns:
            Rows of ``(issue, is_new)``. ``is_new`` is true when the other run
            has no issue with the same table group, schema, table, column and
            issue type. Rows are ordered by ``is_new`` descending, then by
            priority: Definite, Likely, Possible, High, Moderate, anything
            else.
        """
        other = aliased(cls)
        order_weight = case(
            (cls.priority == "Definite", 1),
            (cls.priority == "Likely", 2),
            (cls.priority == "Possible", 3),
            (cls.priority == "High", 4),
            (cls.priority == "Moderate", 5),
            else_=6,
        )
        # The outer join leaves other.id NULL when no counterpart exists.
        is_new_col = (other.id.is_(None) if other_profiling_run_id else null()).label("is_new")
        query = (
            select(
                cls,
                is_new_col,
            )
            .outerjoin(
                other,
                and_(
                    other.table_groups_id == cls.table_groups_id,
                    other.schema_name == cls.schema_name,
                    other.table_name == cls.table_name,
                    other.column_name == cls.column_name,
                    other.type_id == cls.type_id,
                    other.profile_run_id == other_profiling_run_id,
                ),
            )
            # Needed by the ``priority`` SQL expression used in order_weight.
            .join(
                HygieneIssueType,
                HygieneIssueType.id == cls.type_id,
            )
            .where(
                cls.profile_run_id == profiling_run_id,
                *where_clauses,
            )
            .order_by(
                is_new_col.desc(),
                order_weight,
            )
            .limit(
                limit,
            )
        )

        return get_current_session().execute(query)
0 commit comments