Skip to content

Commit c6cc807

Browse files
committed
TG-885
1 parent 506ac5c commit c6cc807

3 files changed

Lines changed: 6 additions & 6 deletions

File tree

testgen/template/dbsetup/050_populate_new_schema_metadata.sql

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@ VALUES ('1001', 'Suggested_Type', 'Column', 'Suggested Data Type', 'Data stored
1818
ar%'' OR column_type ILIKE ''text'') AND NOT (datatype_suggestion ILIKE ''%char%'' OR datatype_suggestion ILIKE ''text'')', 'p.datatype_suggestion::VARCHAR(200)', 'Likely', 'Consider changing the column data type to tighte
1919
n controls over data ingested and to make values more efficient, consistent and suitable for downstream analysis.', NULL, NULL, NULL),
2020
('1002', 'Non_Standard_Blanks', 'Column', 'Non-Standard Blank Values', 'Values representing missing data may be unexpected or inconsistent. Non-standard values may include empty strings as opposed to nulls, dummy entries such as "MISSING" or repeated characters that may have been used to bypass entry requirements, processing artifacts such as "NULL", or spreadsheet artifacts such as "NA", "ERROR".', '(p.filled_value_ct > 0 OR p.zero_length_ct > 0)', '''Filled Values: '' || p.filled_value_ct::VARCHAR || '', Empty String: '' || p.zero_length_ct::VARCHAR || '', Null: '' || p.null_value_ct::VARCHAR || '', Records: '' || p.record_ct::VARCHAR', 'Definite', 'Consider cleansing the column upon ingestion to replace all variants of missing data with a standard designation, like Null.', 'p.filled_value_ct::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '1.0', 'Completeness'),
21-
('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.functional_data_type = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct - p.zero_length_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Filled Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0', 'Validity'),
21+
('1003', 'Invalid_Zip_USA', 'Column', 'Invalid USA Zip Code Format', 'Some values present do not conform with the expected format of USA Zip Codes.', 'p.functional_data_type = ''ZIP_USA'' AND (p.general_type <> ''A'' OR p.filled_value_ct > 0 OR p.min_length >= 1 AND p.min_length <= 4 OR p.max_length > 10)', 'CASE WHEN p.general_type = ''N'' THEN ''Type: '' || p.column_type || '', '' ELSE '''' END || ''Min Length: '' || p.min_length::VARCHAR || '', Max Length: '' || p.max_length::VARCHAR || '', Dummy Values: '' || p.filled_value_ct::VARCHAR', 'Definite', 'Consider correcting invalid column values or changing them to indicate a missing value if corrections cannot be made.', NULL, '1.0', 'Validity'),
2222
('1004', 'Multiple_Types_Minor', 'Multi-Col', 'Multiple Data Types per Column Name - Minor', 'Columns with the same name have the same general type across tables, but the types do not exactly match. Truncation issues may result if columns are commingled and assumed to be the same format.', 'm.general_type_ct = 1 AND m.type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Possible', 'Consider changing the column data types to be fully consistent. This will tighten your standards at ingestion and assure that data is consistent between tables.', NULL, NULL, 'Consistency'),
2323
('1005', 'Multiple_Types_Major', 'Multi-Col', 'Multiple Data Types per Column Name - Major', 'Columns with the same name have broadly different types across tables. Differences could be significant enough to cause errors in downstream analysis, extra steps resulting in divergent business logic and inconsistencies in results.', 'm.general_type_ct > 1', '''Found '' || m.column_ct::VARCHAR || '' columns, '' || m.type_ct::VARCHAR(10) || '' types, '' || m.min_type || '' to '' || m.max_type', 'Likely', 'Ideally, you should change the column data types to be fully consistent. If the data is meant to be different, you should change column names so downstream users aren''t led astray.', NULL, NULL, 'Consistency'),
24-
('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Filled: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33', 'Completeness'),
24+
('1006', 'No_Values', 'Column', 'No Column Values Present', 'This column is present in the table, but no values have been ingested or assigned in any records. This could indicate missing data or a processing error. Note that this considers dummy values and zero-length values as missing data. ', '(p.null_value_ct + p.filled_value_ct + p.zero_length_ct) = p.record_ct', '''Null: '' || p.null_value_ct::VARCHAR(10) || '', Dummy: '' || p.filled_value_ct::VARCHAR(10) || '', Zero Len: '' || p.zero_length_ct::VARCHAR(10)', 'Possible', 'Review your source data, ingestion process, and any processing steps that update this column.', '1.0', '0.33', 'Completeness'),
2525
('1007', 'Column_Pattern_Mismatch', 'Column', 'Pattern Inconsistency Within Column', 'Alpha-numeric string data within this column conforms to 2-4 different patterns, with 95% matching the first pattern. This could indicate data errors in the remaining values. ', 'p.general_type = ''A''
2626
AND p.max_length > 3
27-
AND p.value_ct > (p.numeric_ct + p.filled_value_ct)
27+
AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct)
2828
AND p.distinct_pattern_ct BETWEEN 2 AND 4
2929
AND STRPOS(p.top_patterns, ''N'') > 0
3030
AND (
@@ -35,7 +35,7 @@ n controls over data ingested and to make values more efficient, consistent and
3535
)', '''Patterns: '' || p.top_patterns', 'Likely', 'Review the values for any data that doesn''t conform to the most common pattern and correct any data errors.', '(p.record_ct - SPLIT_PART(p.top_patterns, ''|'', 1)::INT)::FLOAT/NULLIF(p.record_ct, 0)::FLOAT', '0.66', 'Validity'),
3636
('1008', 'Table_Pattern_Mismatch', 'Multi-Col', 'Pattern Inconsistency Across Tables', 'Alpha-numeric string data within this column matches a single pattern, but other columns with the same name have data that matches a different single pattern. Inconsistent formatting may contradict user assumptions and cause downstream errors, extra steps and inconsistent business logic.', 'p.general_type = ''A''
3737
AND p.max_length > 3
38-
AND p.value_ct > (p.numeric_ct + p.filled_value_ct)
38+
AND p.value_ct > (p.numeric_ct + p.filled_value_ct + p.zero_length_ct)
3939
AND m.max_pattern_ct = 1
4040
AND m.column_ct > 1
4141
AND SPLIT_PART(p.top_patterns, ''|'', 2) <> SPLIT_PART(m.very_top_pattern, ''|'', 2)

testgen/template/flavors/redshift/profiling/project_profiling_query_redshift.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ strTemplate05_A: COUNT(DISTINCT UPPER(TRANSLATE("{COL_NAME}", ' '',.-', ''))) a
3131
COUNT( CASE WHEN "{COL_NAME}" ILIKE '"%"' OR "{COL_NAME}" ILIKE '''%''' THEN 1 END ) as quoted_value_ct,
3232
COUNT( CASE WHEN "{COL_NAME}" ~ '[0-9]' THEN 1 END ) as includes_digit_ct,
3333
COUNT( CASE
34-
WHEN "{COL_NAME}" IN ('.', '?', ' ') THEN 1
34+
WHEN LENGTH("{COL_NAME}") > 0 AND "{COL_NAME}" IN ('.', '?', ' ') THEN 1
3535
WHEN LOWER("{COL_NAME}") SIMILAR TO '(^.{2,}|-{2,}|0{2,}|9{2,}|x{2,}|z{2,}$)' THEN 1
3636
WHEN LOWER("{COL_NAME}") IN ('blank','error','missing','tbd',
3737
'n/a','#na','none','null','unknown') THEN 1

testgen/template/profiling/functional_datatype.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ WHERE profile_run_id = '{PROFILE_RUN_ID}';
2424
UPDATE profile_results
2525
SET functional_data_type =
2626
CASE WHEN distinct_value_ct = 1
27-
AND (((value_ct :: float - coalesce(filled_value_ct, 0::bigint) :: float)/record_ct :: float) :: float *100.00 ) > 75
27+
AND (((value_ct - coalesce(filled_value_ct, 0) - coalesce(zero_length_ct, 0))::float/record_ct::float) * 100.00 ) > 75
2828
-- This measures what percentage of the records hold actual values. Threshold: if there is only 1 distinct value and actual values make up more than 75% of the records, the column is treated as a constant.
2929
THEN 'Constant'
3030
ELSE functional_data_type END

0 commit comments

Comments
 (0)