Skip to content

Commit acc469f

Browse files
author
ci bot
committed
Merge branch 'fix-databricks-qa' into 'enterprise'
fix: Databricks QA issues See merge request dkinternal/testgen/dataops-testgen!186
2 parents 8841a4f + c9ec254 commit acc469f

2 files changed

Lines changed: 14 additions & 13 deletions

File tree

testgen/template/dbsetup/050_populate_new_schema_metadata.sql

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,7 +1240,7 @@ WHERE {SUBSET_CONDITION}
12401240
('1271', '1100', 'Profile Anomaly', 'Potential_PII', 'mssql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
12411241
('1272', '1100', 'Profile Anomaly', 'Potential_PII', 'postgresql', NULL, 'SELECT "{COLUMN_NAME}", COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY "{COLUMN_NAME}" ORDER BY "{COLUMN_NAME}" DESC LIMIT 500;'),
12421242

1243-
('1273', '1001', 'Profile Anomaly' , 'Suggested_Type', 'databricks', NULL, 'SELECT TOP 20 `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC;'),
1243+
('1273', '1001', 'Profile Anomaly' , 'Suggested_Type', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
12441244
('1274', '1002', 'Profile Anomaly' , 'Non_Standard_Blanks', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;'),
12451245
('1275', '1003', 'Profile Anomaly' , 'Invalid_Zip_USA', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` LIMIT 500;'),
12461246
('1276', '1004', 'Profile Anomaly' , 'Multiple_Types_Minor', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name, CASE WHEN data_type ILIKE ''timestamp%'' THEN lower(data_type) WHEN data_type ILIKE ''date'' THEN lower(data_type) WHEN data_type ILIKE ''boolean'' THEN ''boolean'' WHEN data_type = ''TEXT'' THEN ''varchar('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type ILIKE ''char%'' THEN ''char('' || CAST(character_maximum_length AS STRING) || '')'' WHEN data_type = ''NUMBER'' AND numeric_precision = 38 AND numeric_scale = 0 THEN ''bigint'' WHEN data_type ILIKE ''num%'' THEN ''numeric('' || CAST(numeric_precision AS STRING) || '','' || CAST(numeric_scale AS STRING) || '')'' ELSE data_type END AS data_type FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND tables.table_type = ''BASE TABLE'' ORDER BY data_type, table_name;'),
@@ -1250,21 +1250,21 @@ WHERE {SUBSET_CONDITION}
12501250
('1280', '1008', 'Profile Anomaly' , 'Table_Pattern_Mismatch', 'databricks', NULL, 'SELECT DISTINCT column_name, columns.table_name FROM information_schema.columns JOIN information_schema.tables ON columns.table_name = tables.table_name AND columns.table_schema = tables.table_schema WHERE columns.table_schema = ''{TARGET_SCHEMA}'' AND columns.column_name = ''{COLUMN_NAME}'' AND UPPER(tables.table_type) = ''BASE TABLE'' ORDER BY table_name; ' ),
12511251
('1281', '1009', 'Profile Anomaly' , 'Leading_Spaces', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` BETWEEN '' !'' AND ''!'' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ),
12521252
('1282', '1010', 'Profile Anomaly' , 'Quoted_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` ILIKE ''"%"'' OR `{COLUMN_NAME}` ILIKE ''''''%'''''' THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ),
1253-
('1283', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ),
1254-
('1284', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT TOP 10 ''Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC;' ),
1253+
('1283', '1011', 'Profile Anomaly' , 'Char_Column_Number_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
1254+
('1284', '1012', 'Profile Anomaly' , 'Char_Column_Date_Values', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Date'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_DATE;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
12551255
('1285', '1013', 'Profile Anomaly' , 'Small Missing Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE (CASE WHEN `{COLUMN_NAME}` IN (''.'', ''?'', '' '') THEN 1 WHEN LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''-{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''0{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''9{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''x{2,}'' OR LOWER(`{COLUMN_NAME}`::STRING) REGEXP ''z{2,}'' THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''blank'',''error'',''missing'',''tbd'', ''n/a'',''#na'',''none'',''null'',''unknown'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''(blank)'',''(error)'',''(missing)'',''(tbd)'', ''(n/a)'',''(#na)'',''(none)'',''(null)'',''(unknown)'') THEN 1 WHEN LOWER(`{COLUMN_NAME}`) IN (''[blank]'',''[error]'',''[missing]'',''[tbd]'', ''[n/a]'',''[#na]'',''[none]'',''[null]'',''[unknown]'') THEN 1 WHEN `{COLUMN_NAME}` = '''' THEN 1 WHEN `{COLUMN_NAME}` IS NULL THEN 1 ELSE 0 END) = 1 GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}`;' ),
1256-
('1286', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC;' ),
1257-
('1287', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC;' ),
1258-
('1288', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING COUNT(*)> 1 ORDER BY COUNT(*) DESC LIMIT 500;' ),
1256+
('1286', '1014', 'Profile Anomaly' , 'Small Divergent Value Ct', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;' ),
1257+
('1287', '1015', 'Profile Anomaly' , 'Boolean_Value_Mismatch', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY count DESC;' ),
1258+
('1288', '1016', 'Profile Anomaly' , 'Potential_Duplicates', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` HAVING count > 1 ORDER BY count DESC LIMIT 500;' ),
12591259
('1289', '1017', 'Profile Anomaly' , 'Standardized_Value_Matches', 'databricks', NULL, 'WITH CTE AS ( SELECT DISTINCT UPPER(TRANSLATE(`{COLUMN_NAME}`, '' '''',.-'', '''')) as possible_standard_value, COUNT(DISTINCT `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY UPPER(TRANSLATE(`{COLUMN_NAME}`, '' '''',.-'', '''')) HAVING COUNT(DISTINCT `{COLUMN_NAME}`) > 1 ) SELECT DISTINCT a.`{COLUMN_NAME}`, possible_standard_value, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a, cte b WHERE UPPER(TRANSLATE(a.`{COLUMN_NAME}`, '' '''',.-'', '''')) = b.possible_standard_value GROUP BY a.`{COLUMN_NAME}`, possible_standard_value ORDER BY possible_standard_value ASC, count DESC LIMIT 500;' ),
12601260
('1290', '1018', 'Profile Anomaly' , 'Unlikely_Date_Values', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, ''{PROFILE_RUN_DATE}'' :: DATE AS profile_run_date, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} a WHERE (`{COLUMN_NAME}` < ''1900-01-01''::DATE) OR (`{COLUMN_NAME}` > ''{PROFILE_RUN_DATE}'' :: DATE + INTERVAL ''30 year'' ) GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ),
12611261
('1291', '1019', 'Profile Anomaly' , 'Recency_One_Year', 'databricks', NULL, 'created_in_ui' ),
12621262
('1292', '1020', 'Profile Anomaly' , 'Recency_Six_Months', 'databricks', NULL, 'created_in_ui' ),
12631263
('1293', '1021', 'Profile Anomaly' , 'Unexpected US States', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ),
12641264
('1294', '1022', 'Profile Anomaly' , 'Unexpected Emails', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} GROUP BY `{COLUMN_NAME}` ORDER BY `{COLUMN_NAME}` DESC LIMIT 500;' ),
1265-
('1295', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT TOP 10 ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
1265+
('1295', '1023', 'Profile Anomaly' , 'Small_Numeric_Value_Ct', 'databricks', NULL, 'SELECT A.* FROM (SELECT DISTINCT ''Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> = 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 10) AS A UNION ALL SELECT B.* FROM (SELECT DISTINCT ''Non-Numeric'' as data_type, `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE <%IS_NUM;`{COLUMN_NAME}`%> != 1 GROUP BY `{COLUMN_NAME}` ORDER BY count DESC) AS B ORDER BY data_type, count DESC LIMIT 10;' ),
12661266
('1296', '1024', 'Profile Anomaly' , 'Invalid_Zip3_USA', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') <> ''999'' GROUP BY `{COLUMN_NAME}` ORDER BY count DESC, `{COLUMN_NAME}` LIMIT 500;'),
1267-
('1297', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY `{COLUMN_NAME}` ORDER BY COUNT(*) DESC LIMIT 500;' ),
1267+
('1297', '1025', 'Profile Anomaly' , 'Delimited_Data_Embedded', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''^([^,|\t]{1,20}[,|\t]){2,}[^,|\t]{0,20}([,|\t]{0,1}[^,|\t]{0,20})*$'') AND NOT REGEXP_LIKE(`{COLUMN_NAME}`::STRING, ''.*\\s(and|but|or|yet)\\s.*'') GROUP BY `{COLUMN_NAME}` ORDER BY count DESC LIMIT 500;' ),
12681268

12691269
('1298', '1004', 'Test Results', 'Alpha_Trunc', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}` , LEN(`{COLUMN_NAME}`) as current_max_length, {THRESHOLD_VALUE} as previous_max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}, (SELECT MAX(LEN(`{COLUMN_NAME}`)) as max_length FROM {TARGET_SCHEMA}.{TABLE_NAME}) a WHERE LEN(`{COLUMN_NAME}`) = a.max_length AND a.max_length < {THRESHOLD_VALUE} LIMIT 500;'),
12701270
('1299', '1005', 'Test Results', 'Avg_Shift', 'databricks', NULL, 'SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
@@ -1278,7 +1278,7 @@ WHERE {SUBSET_CONDITION}
12781278
('1307', '1015', 'Test Results', 'Future_Date', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - ''{TEST_DATE}''::DATE)) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
12791279
('1308', '1016', 'Test Results', 'Future_Date_1Y', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE GREATEST(0, SIGN(`{COLUMN_NAME}`::DATE - (''{TEST_DATE}''::DATE + 365))) > {THRESHOLD_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
12801280
('1309', '1017', 'Test Results', 'Incr_Avg_Shift', 'databricks', NULL, 'SELECT AVG(`{COLUMN_NAME}` :: FLOAT) AS current_average, SUM(`{COLUMN_NAME}` ::FLOAT) AS current_sum, NULLIF(COUNT(`{COLUMN_NAME}` )::FLOAT, 0) as current_value_count FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
1281-
('1310', '1018', 'Test Results', 'LOV_All', 'databricks', NULL, 'SELECT LISTAGG(DISTINCT `{COLUMN_NAME}`, ''|'') WITHIN GROUP (ORDER BY `{COLUMN_NAME}`) FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING LISTAGG(DISTINCT `{COLUMN_NAME}`, ''|'') WITHIN GROUP (ORDER BY `{COLUMN_NAME}`) <> ''{THRESHOLD_VALUE}'' LIMIT 500;'),
1281+
('1310', '1018', 'Test Results', 'LOV_All', 'databricks', NULL, 'SELECT ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), ''|'') AS aggregated_values FROM {TARGET_SCHEMA}.{TABLE_NAME} HAVING ARRAY_JOIN(ARRAY_SORT(COLLECT_SET(`{COLUMN_NAME}`)), ''|'') <> ''{THRESHOLD_VALUE}'' LIMIT 500;'),
12821282
('1311', '1019', 'Test Results', 'LOV_Match', 'databricks', NULL, 'SELECT DISTINCT NULLIF(`{COLUMN_NAME}`, '''') AS `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE NULLIF(`{COLUMN_NAME}`, '''') NOT IN {BASELINE_VALUE} GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
12831283
('1312', '1020', 'Test Results', 'Min_Date', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` :: DATE < ''{BASELINE_VALUE}'' :: DATE GROUP BY `{COLUMN_NAME}` LIMIT 500;'),
12841284
('1313', '1021', 'Test Results', 'Min_Val', 'databricks', NULL, 'SELECT DISTINCT `{COLUMN_NAME}`, (ABS(`{COLUMN_NAME}`) - ABS({BASELINE_VALUE})) AS difference_from_baseline FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE `{COLUMN_NAME}` < {BASELINE_VALUE} LIMIT 500;'),
@@ -1299,7 +1299,7 @@ WHERE {SUBSET_CONDITION}
12991299
('1328', '1040', 'Test Results', 'Variability_Increase', 'databricks', NULL, 'SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
13001300
('1329', '1041', 'Test Results', 'Variability_Decrease', 'databricks', NULL, 'SELECT STDDEV(CAST(`{COLUMN_NAME}` AS FLOAT)) as current_standard_deviation FROM {TARGET_SCHEMA}.{TABLE_NAME};'),
13011301

1302-
('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE lower(`{COLUMN_NAME}`) IN (SELECT trim(value) FROM TABLE (FLATTEN(INPUT => SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', POSITION('':'', ''{DETAIL_EXPRESSION}'') + 2), ''|''))) ) GROUP BY `{COLUMN_NAME}`;'),
1302+
('1230', '1027', 'Profile Anomaly' , 'Variant_Coded_Values', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS count FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE LOWER(`{COLUMN_NAME}`) IN (SELECT TRIM(value) FROM (SELECT EXPLODE(SPLIT(SUBSTRING(''{DETAIL_EXPRESSION}'', INSTR(''{DETAIL_EXPRESSION}'', '':'') + 2), ''\\|'')) AS value)) GROUP BY `{COLUMN_NAME}`;'),
13031303
('1330', '1043', 'Test Results', 'Valid_Characters', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE REGEXP_LIKE(`{COLUMN_NAME}`, ''.*[[:cntrl:]].*'') OR `{COLUMN_NAME}`::STRING LIKE '' %'' OR `{COLUMN_NAME}`::STRING LIKE ''''''%'''''' OR `{COLUMN_NAME}`::STRING LIKE ''"%"'' GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
13041304
('1331', '1044', 'Test Results', 'Valid_US_Zip', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),
13051305
('1332', '1045', 'Test Results', 'Valid_US_Zip3', 'databricks', NULL, 'SELECT `{COLUMN_NAME}`, COUNT(*) AS record_ct FROM {TARGET_SCHEMA}.{TABLE_NAME} WHERE TRANSLATE(`{COLUMN_NAME}`,''012345678'',''999999999'') NOT IN (''99999'', ''999999999'', ''99999-9999'') GROUP BY `{COLUMN_NAME}` ORDER BY record_ct DESC LIMIT 20;'),

testgen/ui/services/hygiene_issues_service.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def get_source_data(hi_data):
2222
AND t.lookup_query > '';
2323
"""
2424

25-
def get_lookup_query(test_id, detail_exp, column_names):
25+
def get_lookup_query(test_id, detail_exp, column_names, sql_flavor):
2626
if test_id in {"1019", "1020"}:
2727
start_index = detail_exp.find("Columns: ")
2828
if start_index == -1:
@@ -31,8 +31,9 @@ def get_lookup_query(test_id, detail_exp, column_names):
3131
start_index += len("Columns: ")
3232
column_names_str = detail_exp[start_index:]
3333
columns = [col.strip() for col in column_names_str.split(",")]
34+
quote = "`" if sql_flavor == "databricks" else '"'
3435
queries = [
35-
f"SELECT '{column}' AS column_name, MAX({column}) AS max_date_available FROM {{TARGET_SCHEMA}}.{{TABLE_NAME}}"
36+
f"SELECT '{column}' AS column_name, MAX({quote}{column}{quote}) AS max_date_available FROM {{TARGET_SCHEMA}}.{{TABLE_NAME}}"
3637
for column in columns
3738
]
3839
sql_query = " UNION ALL ".join(queries) + " ORDER BY max_date_available DESC;"
@@ -42,7 +43,7 @@ def get_lookup_query(test_id, detail_exp, column_names):
4243

4344
def replace_parms(str_query):
4445
str_query = (
45-
get_lookup_query(hi_data["anomaly_id"], hi_data["detail"], hi_data["column_name"])
46+
get_lookup_query(hi_data["anomaly_id"], hi_data["detail"], hi_data["column_name"], lst_query[0]["sql_flavor"])
4647
if lst_query[0]["lookup_query"] == "created_in_ui"
4748
else lst_query[0]["lookup_query"]
4849
)

0 commit comments

Comments
 (0)