Skip to content

Commit 80c644b

Browse files
committed
fix(tests): add flavor specific stale table test generation
1 parent 3b3d3f3 commit 80c644b

15 files changed

Lines changed: 366 additions & 32 deletions

File tree

testgen/commands/queries/generate_tests_query.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
from typing import ClassVar, TypedDict
33

44
from testgen.common import CleanSQL, date_service, read_template_sql_file
5-
from testgen.common.database.database_service import get_queries_for_command, replace_params
5+
from testgen.common.database.database_service import replace_params
6+
from testgen.common.read_file import get_template_files
67

78
LOG = logging.getLogger("testgen")
89

@@ -67,11 +68,35 @@ def GetTestTypesSQL(self) -> tuple[str, dict]:
6768

6869
def GetTestDerivationQueriesAsList(self, template_directory: str) -> list[tuple[str, dict]]:
    """Build the test-derivation queries found under ``template_directory``.

    Template files are collected from the generic directory first and then
    from the flavor-specific directory
    ``flavors.<self.sql_flavor>.<template_directory>``. Templates are keyed by
    file name, so a flavor-specific file replaces a generic file of the same
    name while keeping the generic file's position in the result.

    A directory that cannot be listed is reported at debug level and skipped,
    so either location may be absent.

    Args:
        template_directory: Template sub-directory holding the ``*.sql`` files.

    Returns:
        One ``self._get_query(...)`` result per distinct template file name.
    """
    # Runs on App database
    generic_template_directory = template_directory
    flavor_template_directory = f"flavors.{self.sql_flavor}.{template_directory}"

    # Order matters: the flavor directory is scanned last so it wins on name clashes.
    query_templates = {}
    for directory in (generic_template_directory, flavor_template_directory):
        try:
            for query_file in get_template_files(r"^.*sql$", directory):
                query_templates[query_file.name] = directory
        except Exception:
            # Best effort: a missing directory is expected for flavors (or commands)
            # that define no templates of their own. The message names the directory
            # that actually failed.
            LOG.debug(
                f"query template '{directory}' directory does not exist",
                exc_info=True,
                stack_info=True,
            )

    return [
        self._get_query(filename, sub_directory=sub_directory)
        for filename, sub_directory in query_templates.items()
    ]
75100

76101
def GetTestQueriesFromGenericFile(self) -> tuple[str, dict]:
77102
# Runs on App database

testgen/template/dbupgrade/0501_incremental_upgrade.sql renamed to testgen/template/dbupgrade/0147_incremental_upgrade.sql

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
SET SEARCH_PATH TO {SCHEMA_NAME};
22

3+
ALTER TABLE test_definitions
4+
ADD COLUMN history_calculation VARCHAR(20),
5+
ADD COLUMN history_lookback INTEGER;
6+
7+
ALTER TABLE test_results
8+
ADD COLUMN result_signal VARCHAR(1000);
9+
310
ALTER TABLE test_runs ADD COLUMN log_ct INTEGER;
411

512
DROP VIEW IF EXISTS v_test_results;

testgen/template/dbupgrade/0500_incremental_upgrade.sql

Lines changed: 0 additions & 8 deletions
This file was deleted.
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
-- Generates one Stale_Table test definition per profiled table.
-- The custom_query column receives the text of a SQL fingerprint expression
-- built from the most informative columns of each table: COUNT(*) followed by
-- per-column aggregates, all cast to STRING and joined with a pipe character.
-- Values written in braces are template parameters substituted before execution.
INSERT INTO test_definitions
    (table_groups_id, profile_run_id, test_type, test_suite_id,
     schema_name, table_name,
     skip_errors, test_active, last_auto_gen_date, profiling_as_of_date,
     lock_refresh, history_calculation, history_lookback, custom_query)
WITH
-- Latest profiling run date for the table group, limited to the project,
-- the connection of the requested test suite, and the as-of date.
last_run AS (
    SELECT r.table_groups_id, MAX(run_date) AS last_run_date
      FROM profile_results p
           INNER JOIN profiling_runs r
                   ON (p.profile_run_id = r.id)
           INNER JOIN test_suites ts
                   ON p.project_code = ts.project_code
                  AND p.connection_id = ts.connection_id
     WHERE p.project_code = '{PROJECT_CODE}'
       AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID
       AND ts.id = '{TEST_SUITE_ID}'
       AND p.run_date::DATE <= '{AS_OF_DATE}'
     GROUP BY r.table_groups_id
),
-- Column-level profile rows belonging to that latest run.
curprof AS (
    SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct
      FROM last_run lr
           INNER JOIN profile_results p
                   ON (lr.table_groups_id = p.table_groups_id
                  AND lr.last_run_date = p.run_date)
),
-- Tables whose existing Stale_Table test is locked against refresh.
-- They are excluded from the insert by the anti-join at the bottom.
locked AS (
    SELECT schema_name, table_name
      FROM test_definitions
     WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID
       AND test_suite_id = '{TEST_SUITE_ID}'
       AND test_type = 'Stale_Table'
       AND lock_refresh = 'Y'
),
-- ID columns, ranked per table (the two best are used below):
-- ID-Unique first, then ID-Secondary, then any other ID type.
-- Ties go to the lower distinct count, then to the later column name.
id_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY
                                  CASE
                                      WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1
                                      WHEN functional_data_type = 'ID-Secondary' THEN 2
                                      ELSE 3
                                  END, distinct_value_ct, column_name DESC) AS rank
      FROM curprof
     WHERE functional_data_type ILIKE 'ID%'
),
-- Process date columns, ranked per table (only the best is used below).
-- Names containing mod or up sort first, names containing cr or in second.
-- Any other name gets a NULL sort key because the CASE has no ELSE branch.
process_date_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY
                                  CASE
                                      WHEN column_name ILIKE '%mod%' THEN 1
                                      WHEN column_name ILIKE '%up%' THEN 1
                                      WHEN column_name ILIKE '%cr%' THEN 2
                                      WHEN column_name ILIKE '%in%' THEN 2
                                  END , distinct_value_ct DESC, column_name) AS rank
      FROM curprof
     WHERE functional_data_type ILIKE 'process%'
),
-- Transaction date, period and timestamp columns, ranked per table by
-- highest distinct count (only the best is used below).
tran_date_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY
                                  distinct_value_ct DESC, column_name) AS rank
      FROM curprof
     WHERE functional_data_type ILIKE 'transactional date%' OR functional_data_type ILIKE 'period%'
        OR functional_data_type = 'timestamp'
),
-- Numeric measure candidates with a weighted change-detection score.
-- The five weighted terms are, in order:
--   0.25  cardinality    distinct values relative to record count
--   0.15  range          max minus min relative to the absolute average
--   0.10  nontriviality  log of distinct count, capped at 1, over log of record count
--   0.40  variability    standard deviation relative to the absolute average
--   0.10  null penalty   one minus the share of null values
-- A NULL in any term makes the whole score NULL.
numeric_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           (
               0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) +
               0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) +
               0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) +
               0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) +
               0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)))
           ) AS change_detection_score
      FROM curprof
     WHERE general_type = 'N'
       AND (functional_data_type ILIKE 'Measure%' OR functional_data_type IN ('Sequence', 'Constant'))
),
-- Scored numeric columns ranked per table, best score first.
-- Columns without a score are dropped.
numeric_cols_ranked AS (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY change_detection_score DESC, column_name) AS rank
      FROM numeric_cols
     WHERE change_detection_score IS NOT NULL
),
-- Columns selected for the fingerprint: up to two IDs, one process date,
-- one transaction date and one numeric measure per table.
combined AS (
    SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order
      FROM id_cols
     WHERE rank <= 2
    UNION ALL
    SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order
      FROM process_date_cols
     WHERE rank = 1
    UNION ALL
    SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order
      FROM tran_date_cols
     WHERE rank = 1
    UNION ALL
    SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order
      FROM numeric_cols_ranked
     WHERE rank = 1
),
-- One fingerprint expression per table. The aggregate template is chosen by
-- general_type (D, A or N) and each @@@ placeholder is replaced with the
-- double-quoted column name. Column fragments are ordered by element_type
-- first, so fingerprint_order only breaks ties within an element type.
newtests AS (
    SELECT profile_run_id, schema_name, table_name,
           'COUNT(*)::STRING || ''|'' || ' ||
           STRING_AGG(
               REPLACE(
                   CASE
                       WHEN general_type = 'D' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || COUNT(DISTINCT @@@)::STRING'
                       WHEN general_type = 'A' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || COUNT(DISTINCT @@@)::STRING || ''|'' || SUM(LENGTH(@@@))::STRING'
                       WHEN general_type = 'N' THEN 'MIN(@@@)::STRING || ''|'' || MAX(@@@::STRING) || ''|'' || SUM(@@@)::STRING || ''|'' || ROUND(AVG(@@@), 5)::STRING || ''|'' || ROUND(STDDEV(@@@), 5)::STRING'
                   END,
                   '@@@', '"' || column_name || '"'),
               ' || ''|'' || '
               ORDER BY element_type, fingerprint_order, column_name) AS fingerprint
      FROM combined
     GROUP BY profile_run_id, schema_name, table_name
)
-- Emit the new definitions. Stale_Table must be an active test type and,
-- when a generation set is requested, must belong to that set.
SELECT '{TABLE_GROUPS_ID}'::UUID AS table_groups_id,
       n.profile_run_id,
       'Stale_Table' AS test_type,
       '{TEST_SUITE_ID}' AS test_suite_id,
       n.schema_name, n.table_name,
       0 AS skip_errors, 'Y' AS test_active,

       '{RUN_DATE}'::TIMESTAMP AS last_auto_gen_date,
       '{AS_OF_DATE}'::TIMESTAMP AS profiling_as_of_date,
       'N' AS lock_refresh,
       'Value' AS history_calculation,
       1 AS history_lookback,
       fingerprint AS custom_query
  FROM newtests n
       INNER JOIN test_types t
               ON (t.test_type = 'Stale_Table'
              AND t.active = 'Y')
       LEFT JOIN generation_sets s
              ON (s.test_type = t.test_type
             AND s.generation_set = '{GENERATION_SET}')
       LEFT JOIN locked l
              ON (l.schema_name = n.schema_name
             AND l.table_name = n.table_name)
 WHERE (s.generation_set IS NOT NULL
        OR '{GENERATION_SET}' = '')
   AND l.schema_name IS NULL;
153+
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
-- Generates one Stale_Table test definition per profiled table.
-- The custom_query column receives the text of a SQL fingerprint expression
-- built from the most informative columns of each table: COUNT(*) followed by
-- per-column aggregates, each CAST to varchar and joined with + and a pipe
-- character (presumably for a T-SQL style target database, confirm against
-- the flavor directory this template lives in).
-- Values written in braces are template parameters substituted before execution.
INSERT INTO test_definitions
    (table_groups_id, profile_run_id, test_type, test_suite_id,
     schema_name, table_name,
     skip_errors, test_active, last_auto_gen_date, profiling_as_of_date,
     lock_refresh, history_calculation, history_lookback, custom_query)
WITH
-- Latest profiling run date for the table group, limited to the project,
-- the connection of the requested test suite, and the as-of date.
last_run AS (
    SELECT r.table_groups_id, MAX(run_date) AS last_run_date
      FROM profile_results p
           INNER JOIN profiling_runs r
                   ON (p.profile_run_id = r.id)
           INNER JOIN test_suites ts
                   ON p.project_code = ts.project_code
                  AND p.connection_id = ts.connection_id
     WHERE p.project_code = '{PROJECT_CODE}'
       AND r.table_groups_id = '{TABLE_GROUPS_ID}'::UUID
       AND ts.id = '{TEST_SUITE_ID}'
       AND p.run_date::DATE <= '{AS_OF_DATE}'
     GROUP BY r.table_groups_id
),
-- Column-level profile rows belonging to that latest run.
curprof AS (
    SELECT p.profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct, record_ct, max_value, min_value, avg_value, stdev_value, null_value_ct
      FROM last_run lr
           INNER JOIN profile_results p
                   ON (lr.table_groups_id = p.table_groups_id
                  AND lr.last_run_date = p.run_date)
),
-- Tables whose existing Stale_Table test is locked against refresh.
-- They are excluded from the insert by the anti-join at the bottom.
locked AS (
    SELECT schema_name, table_name
      FROM test_definitions
     WHERE table_groups_id = '{TABLE_GROUPS_ID}'::UUID
       AND test_suite_id = '{TEST_SUITE_ID}'
       AND test_type = 'Stale_Table'
       AND lock_refresh = 'Y'
),
-- ID columns, ranked per table (the two best are used below):
-- ID-Unique first, then ID-Secondary, then any other ID type.
-- Ties go to the lower distinct count, then to the later column name.
id_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY
                                  CASE
                                      WHEN functional_data_type ILIKE 'ID-Unique%' THEN 1
                                      WHEN functional_data_type = 'ID-Secondary' THEN 2
                                      ELSE 3
                                  END, distinct_value_ct, column_name DESC) AS rank
      FROM curprof
     WHERE functional_data_type ILIKE 'ID%'
),
-- Process date columns, ranked per table (only the best is used below).
-- Names containing mod or up sort first, names containing cr or in second.
-- Any other name gets a NULL sort key because the CASE has no ELSE branch.
process_date_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY
                                  CASE
                                      WHEN column_name ILIKE '%mod%' THEN 1
                                      WHEN column_name ILIKE '%up%' THEN 1
                                      WHEN column_name ILIKE '%cr%' THEN 2
                                      WHEN column_name ILIKE '%in%' THEN 2
                                  END , distinct_value_ct DESC, column_name) AS rank
      FROM curprof
     WHERE functional_data_type ILIKE 'process%'
),
-- Transaction date, period and timestamp columns, ranked per table by
-- highest distinct count (only the best is used below).
tran_date_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           distinct_value_ct,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY
                                  distinct_value_ct DESC, column_name) AS rank
      FROM curprof
     WHERE functional_data_type ILIKE 'transactional date%' OR functional_data_type ILIKE 'period%'
        OR functional_data_type = 'timestamp'
),
-- Numeric measure candidates with a weighted change-detection score.
-- The five weighted terms are, in order:
--   0.25  cardinality    distinct values relative to record count
--   0.15  range          max minus min relative to the absolute average
--   0.10  nontriviality  log of distinct count, capped at 1, over log of record count
--   0.40  variability    standard deviation relative to the absolute average
--   0.10  null penalty   one minus the share of null values
-- A NULL in any term makes the whole score NULL.
numeric_cols AS (
    SELECT profile_run_id, schema_name, table_name, column_name, functional_data_type, general_type,
           (
               0.25 * (distinct_value_ct * 1.0 / NULLIF(record_ct, 0)) +
               0.15 * ((max_value - min_value) / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) +
               0.10 * (LEAST(1, LOG(GREATEST(distinct_value_ct, 2))) / LOG(GREATEST(record_ct, 2))) +
               0.40 * (stdev_value / NULLIF(ABS(NULLIF(avg_value, 0)), 1)) +
               0.10 * (1.0 - (null_value_ct * 1.0 / NULLIF(NULLIF(record_ct, 0), 1)))
           ) AS change_detection_score
      FROM curprof
     WHERE general_type = 'N'
       AND (functional_data_type ILIKE 'Measure%' OR functional_data_type IN ('Sequence', 'Constant'))
),
-- Scored numeric columns ranked per table, best score first.
-- Columns without a score are dropped.
numeric_cols_ranked AS (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY schema_name, table_name
                              ORDER BY change_detection_score DESC, column_name) AS rank
      FROM numeric_cols
     WHERE change_detection_score IS NOT NULL
),
-- Columns selected for the fingerprint: up to two IDs, one process date,
-- one transaction date and one numeric measure per table.
combined AS (
    SELECT profile_run_id, schema_name, table_name, column_name, 'ID' AS element_type, general_type, 10 + rank AS fingerprint_order
      FROM id_cols
     WHERE rank <= 2
    UNION ALL
    SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_P' AS element_type, general_type, 20 + rank AS fingerprint_order
      FROM process_date_cols
     WHERE rank = 1
    UNION ALL
    SELECT profile_run_id, schema_name, table_name, column_name, 'DATE_T' AS element_type, general_type, 30 + rank AS fingerprint_order
      FROM tran_date_cols
     WHERE rank = 1
    UNION ALL
    SELECT profile_run_id, schema_name, table_name, column_name, 'MEAS' AS element_type, general_type, 40 + rank AS fingerprint_order
      FROM numeric_cols_ranked
     WHERE rank = 1
),
-- One fingerprint expression per table. The aggregate template is chosen by
-- general_type (D, A or N) and each @@@ placeholder is replaced with the
-- double-quoted column name. Column fragments are ordered by element_type
-- first, so fingerprint_order only breaks ties within an element type.
newtests AS (
    SELECT profile_run_id,
           schema_name,
           table_name,
           'CAST(COUNT(*) AS varchar) + ''|'' + ' || STRING_AGG(
               REPLACE(
                   CASE
                       WHEN general_type = 'D' THEN 'CAST(MIN(@@@) AS varchar) + ''|'' + MAX(CAST(@@@ AS varchar)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS varchar)'
                       WHEN general_type = 'A' THEN 'CAST(MIN(@@@) AS varchar) + ''|'' + MAX(CAST(@@@ AS varchar)) + ''|'' + CAST(COUNT(DISTINCT @@@) AS varchar) + ''|'' + CAST(SUM(LEN(@@@)) AS varchar)'
                       WHEN general_type = 'N' THEN 'CAST(MIN(@@@) AS varchar) + ''|'' + MAX(CAST(@@@ AS varchar)) + ''|'' + CAST(SUM(@@@) AS varchar) + ''|'' + CAST(ROUND(AVG(@@@), 5) AS varchar) + ''|'' + CAST(ROUND(STDEV(@@@), 5) AS varchar)'
                   END,
                   '@@@', '"' || column_name || '"'
               ),
               ' + ''|'' + '
               ORDER BY element_type, fingerprint_order, column_name
           ) AS fingerprint
      FROM combined
     GROUP BY profile_run_id, schema_name, table_name
)
-- Emit the new definitions. Stale_Table must be an active test type and,
-- when a generation set is requested, must belong to that set.
SELECT '{TABLE_GROUPS_ID}'::UUID AS table_groups_id,
       n.profile_run_id,
       'Stale_Table' AS test_type,
       '{TEST_SUITE_ID}' AS test_suite_id,
       n.schema_name, n.table_name,
       0 AS skip_errors, 'Y' AS test_active,

       '{RUN_DATE}'::TIMESTAMP AS last_auto_gen_date,
       '{AS_OF_DATE}'::TIMESTAMP AS profiling_as_of_date,
       'N' AS lock_refresh,
       'Value' AS history_calculation,
       1 AS history_lookback,
       fingerprint AS custom_query
  FROM newtests n
       INNER JOIN test_types t
               ON (t.test_type = 'Stale_Table'
              AND t.active = 'Y')
       LEFT JOIN generation_sets s
              ON (s.test_type = t.test_type
             AND s.generation_set = '{GENERATION_SET}')
       LEFT JOIN locked l
              ON (l.schema_name = n.schema_name
             AND l.table_name = n.table_name)
 WHERE (s.generation_set IS NOT NULL
        OR '{GENERATION_SET}' = '')
   AND l.schema_name IS NULL;
158+

0 commit comments

Comments
 (0)