Merge branch 'fix-req-entity-test' into 'enterprise'

ci bot · ci bot · commit 79b19811a049 · 2025-07-15T14:44:00.000Z
fix: Do not generate Required Entity tests for empty tables

See merge request dkinternal/testgen/dataops-testgen!255
diff --git a/testgen/template/dbsetup/050_populate_new_schema_metadata.sql b/testgen/template/dbsetup/050_populate_new_schema_metadata.sql
@@ -123,7 +123,7 @@ VALUES  ('1004', 'Alpha_Trunc', 'Alpha Truncation', 'Maximum character count con
         ('1025', 'Outlier_Pct_Below', 'Outliers Below', 'Consistent outlier counts under 2 SD below mean', 'Tests that percent of outliers over 2 SD below Mean doesn''t exceed threshold', 'Percent of outliers exceeding 2 SD below the mean is greater than expected threshold.', 'Pct records under limit', NULL, 'functional_data_type = ''Measurement'' AND distinct_value_ct > 30 AND NOT distinct_value_ct = max_value - min_value + 1 AND distinct_value_ct::FLOAT/value_ct::FLOAT > 0.1 AND stdev_value::FLOAT/avg_value::FLOAT > 0.01 AND column_name NOT ILIKE ''%latitude%'' AND column_name NOT ilike ''%longitude%''', 'GREATEST(0, {RESULT_MEASURE}::FLOAT-{THRESHOLD_VALUE}::FLOAT)', '0.75', NULL, NULL, 'baseline_avg,baseline_sd,threshold_value', 'avg_value,stdev_value,0.05', 'Baseline Mean, Baseline Std Deviation, Pct Records over 2 SD', NULL, 'Warning', 'CAT', 'column', 'Accuracy', 'Data Drift', 'Expected maximum pct records over lower 2 SD limit', 'This test counts the number of data points that may be considered as outliers, determined by whether their value exceeds 2 standard deviations below the mean at baseline.  Assuming a normal distribution, a small percentage (defaulted to 5%) of outliers is expected. The actual number may vary for different distributions. The expected threshold reflects the maximum percentage of outliers you expect to see.  This test uses the baseline mean rather than the mean for the latest dataset to capture systemic shift as well as individual outliers. ', 'Y'),
         ('1026', 'Pattern_Match', 'Pattern Match', 'Column values match alpha-numeric pattern', 'Tests that all values in the column match the same alpha-numeric pattern identified in baseline data', 'Alpha values do not match consistent pattern in baseline.', 'Pattern Mismatches', NULL, '(functional_data_type IN (''Attribute'', ''DateTime Stamp'', ''Phone'') OR functional_data_type ILIKE ''ID%'' OR functional_data_type ILIKE ''Period%'') AND fn_charcount(top_patterns, E'' \| '' ) = 1 AND REPLACE(SPLIT_PART(top_patterns, ''|'' , 2), ''N'' , '''' ) > '''' AND distinct_value_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'baseline_value,threshold_value', 'TRIM(REPLACE(REPLACE(REPLACE(REGEXP_REPLACE(SPLIT_PART(top_patterns, '' | '', 2), ''([*+\-%_])'', ''[\1]'', ''g''), ''A'', ''[A-Z]''), ''N'', ''[0-9]''), ''a'', ''[a-z]'')),0', 'Pattern at Baseline,Threshold Error Count', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected count of pattern mismatches', 'This test is appropriate for character fields that are expected to appear in a consistent format. It uses pattern matching syntax as appropriate for your database:  REGEX matching if available, otherwise LIKE expressions. The expected threshold is the number of records that fail to match the defined pattern.', 'Y'),
         ('1028', 'Recency', 'Recency', 'Latest date within expected range of test date', 'Tests that the latest date in column is within a set number of days of the test date', 'Most recent date value not within expected days of test date.', 'Days before test', 'Number of days that most recent date precedes the date of test', 'general_type= ''D'' AND max_date <= run_date AND NOT column_name IN ( ''filedate'' , ''file_date'' ) AND NOT functional_data_type IN (''Future Date'', ''Schedule Date'') AND DATEDIFF( ''DAY'' , max_date, run_date) <= 62', '(ABS({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT*{PRO_RECORD_CT}::FLOAT/(1.0+DATEDIFF(''DAY'', ''{MIN_DATE}'', ''{MAX_DATE}''))::FLOAT)/NULLIF({RECORD_CT}::FLOAT, 0)', '0.75', NULL, NULL, 'threshold_value', 'CASE WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 3 THEN DATEDIFF(''DAY'', max_date, run_date) + 3 WHEN DATEDIFF(''DAY'', max_date, run_date) <= 7 then DATEDIFF(''DAY'', max_date, run_date) + 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) <= 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 7.0) * 7 WHEN DATEDIFF( ''DAY'' , max_date, run_date) > 31 THEN CEILING( DATEDIFF( ''DAY'' , max_date, run_date)::FLOAT / 30.0) * 30 END', 'Threshold Maximum Days before Test', NULL, 'Warning', 'CAT', 'column', 'Timeliness', 'Recency', 'Expected maximum count of days preceding test date', 'This test evaluates recency based on the latest referenced dates in the column.  The test is appropriate for transactional dates and timestamps.  The test can be especially valuable because timely data deliveries themselves may not assure that the most recent data is present. You can adjust the expected threshold to the maximum number of days that you expect the data to age before the dataset is refreshed.  ', 'Y'),
-        ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'),
+        ('1030', 'Required', 'Required Entry', 'Required non-null value present', 'Tests that a non-null value is present in each record for the column, consistent with baseline data', 'Every record for this column is expected to be filled, but some are missing.', 'Missing values', NULL, 'record_ct = value_ct AND record_ct > 10', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Missing Value Count', NULL, 'Fail', 'CAT', 'column', 'Completeness', 'Schema Drift', 'Expected count of missing values', NULL, 'Y'),
         ('1033', 'Street_Addr_Pattern', 'Street Address', 'Enough street address entries match defined pattern', 'Tests for percent of records matching standard street address pattern.', 'Percent of values matching standard street address format is under expected threshold.', 'Percent matches', 'Percent of records that match street address pattern', '(std_pattern_match=''STREET_ADDR'') AND (avg_length <> round(avg_length)) AND (avg_embedded_spaces BETWEEN 2 AND 6) AND (avg_length < 35)', '({VALUE_CT}::FLOAT * ({RESULT_MEASURE}::FLOAT - {THRESHOLD_VALUE}::FLOAT)/100.0)/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '75', 'Threshold Pct that Match Address Pattern', NULL, 'Fail', 'CAT', 'column', 'Validity', 'Schema Drift', 'Expected percent of records that match standard street address pattern', 'The street address pattern used in this test should match the vast majority of USA addresses.  You can adjust the threshold percent of matches based on the results you are getting -- you may well want to tighten it to make the test more sensitive to invalid entries.', 'Y'),
         ('1034', 'Unique', 'Unique Values', 'Each column value is unique', 'Tests that no values for the column are repeated in multiple records.', 'Column values should be unique per row.', 'Duplicate values', 'Count of non-unique values', 'record_ct > 500 and record_ct = distinct_value_ct and value_ct > 0', '({RESULT_MEASURE}-{THRESHOLD_VALUE})::FLOAT/NULLIF({RECORD_CT}::FLOAT, 0)', '1.0', NULL, NULL, 'threshold_value', '0', 'Threshold Duplicate Value Count', NULL, 'Fail', 'CAT', 'column', 'Uniqueness', 'Schema Drift', 'Expected count of duplicate values', 'This test is ideal when the database itself does not enforce a primary key constraint on the table. It serves as an independent check on uniqueness.  If''s also useful when there are a small number of exceptions to uniqueness, which can be reflected in the expected threshold count of duplicates.', 'Y'),
         ('1035', 'Unique_Pct', 'Percent Unique', 'Consistent ratio of unique values', 'Tests for statistically-significant shift in percentage of unique values vs. baseline data.', 'Significant shift in percent of unique values vs. baseline.', 'Difference measure', 'Cohen''s H Difference (0.20 small, 0.5 mod, 0.8 large, 1.2 very large, 2.0 huge)', 'distinct_value_ct > 10 AND functional_data_type NOT ILIKE ''Measurement%''', '2.0 * (1.0 - fn_normal_cdf(ABS({RESULT_MEASURE}::FLOAT) / 2.0))', '0.75', NULL, NULL, 'baseline_value_ct,baseline_unique_ct,threshold_value', 'value_ct,distinct_value_ct,0.5', 'Value Count at Baseline,Distinct Value Count at Baseline,Standardized Difference Measure (0 to 1)', NULL, 'Warning', 'CAT', 'column', 'Uniqueness', 'Data Drift', 'Expected maximum Cohen''s H Difference', 'You can think of this as a test of similarity that measures whether the percentage of unique values is consistent with the percentage at baseline.  A significant change might indicate duplication or a telling shift in cardinality between entities. The test uses Cohen''s H, a statistical test to identify a significant difference between two ratios.  Results are reported on a standardized scale, which can be interpreted via a rule-of-thumb from small to huge.  You can refine the expected threshold value as you view legitimate results of the measure over time.', 'Y'),