|
| 1 | +package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard |
| 2 | + |
| 3 | +import io.github.datacatering.datacaterer.api.ValidationBuilder |
| 4 | +import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{DataQualityTypeEnum, OpenDataContractStandardDataQualityV3, OpenDataContractStandardElementV3, OpenDataContractStandardSchemaV3} |
| 5 | +import org.apache.log4j.Logger |
| 6 | + |
/**
 * Converts Open Data Contract Standard (ODCS) v3 data quality rules into Data Caterer
 * [[ValidationBuilder]]s.
 *
 * Quality rules may be attached to the schema itself or to individual properties. Rules of
 * type `text` are documentation only and never produce a validation. Rules that cannot be
 * converted are skipped with a warning rather than failing the whole conversion.
 */
object OpenDataContractStandardDataValidations {

  private val LOGGER = Logger.getLogger(getClass.getName)

  /** Severities (lower-cased) that are treated as non-blocking and get a lenient error threshold. */
  private val LENIENT_SEVERITIES = Set("warning", "info")

  /** Value passed to `errorThreshold` for lenient severities; intended to allow up to 5% failures. */
  private val LENIENT_ERROR_THRESHOLD = 0.05

  /**
   * Convert ODCS data quality rules to Data Caterer validations.
   *
   * Schema-level rules are converted first, followed by the rules of each property in order.
   *
   * @param schema The ODCS schema containing quality rules
   * @return List of validation builders; empty when the schema defines no convertible rules
   */
  def getDataValidations(schema: OpenDataContractStandardSchemaV3): List[ValidationBuilder] = {
    val schemaLevelValidations = schema.quality
      .map(_.flatMap(convertQualityCheck(_, None)).toList)
      .getOrElse(List())

    val propertyLevelValidations = schema.properties
      .map(_.flatMap(property => {
        property.quality
          .map(_.flatMap(convertQualityCheck(_, Some(property))).toList)
          .getOrElse(List())
      }).toList)
      .getOrElse(List())

    val allValidations = schemaLevelValidations ++ propertyLevelValidations
    if (allValidations.nonEmpty) {
      LOGGER.info(s"Converted ODCS quality checks to validations, schema=${schema.name}, num-validations=${allValidations.size}")
    }
    allValidations
  }

  /**
   * Dispatch a single quality rule to the converter matching its type.
   *
   * @param quality     The quality rule to convert
   * @param optProperty The property the rule is attached to, or None for schema-level rules
   * @return The validation, or None when the rule is documentation only or cannot be converted
   */
  private def convertQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    quality.`type` match {
      case DataQualityTypeEnum.library => convertLibraryQualityCheck(quality, optProperty)
      case DataQualityTypeEnum.sql => convertSqlQualityCheck(quality, optProperty)
      case DataQualityTypeEnum.text => None // Text is just documentation
      case DataQualityTypeEnum.custom => convertCustomQualityCheck(quality, optProperty)
      case other =>
        // Skip instead of failing with a MatchError, consistent with unknown library rules
        LOGGER.warn(s"Unsupported ODCS quality check type, skipping: type=$other, rule=${quality.rule.getOrElse("none")}")
        None
    }
  }

  /**
   * Convert a `library` quality rule, selected by its (case-insensitive) rule name.
   *
   * Field-level rules resolve the target field from `quality.field`, falling back to the name
   * of the property the rule is attached to. When a recognised rule lacks its field or its
   * required operand, the rule is skipped and a warning is logged.
   *
   * @param quality     The quality rule to convert
   * @param optProperty The property the rule is attached to, or None for schema-level rules
   * @return The validation, or None when the rule is unknown or incomplete
   */
  private def convertLibraryQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    // Get the field name from quality.field or optProperty
    val fieldName = quality.field.orElse(optProperty.map(_.name))

    def warnSkipped(reason: String): Unit =
      LOGGER.warn(s"Skipping ODCS library quality check, rule=${quality.rule.getOrElse("none")}, reason=$reason")

    // Builds a field-level validation; warns when no field name can be resolved
    def onField(build: String => ValidationBuilder): Option[ValidationBuilder] = {
      if (fieldName.isEmpty) warnSkipped("no field name defined on quality check or property")
      fieldName.map(field => addDescriptionAndThreshold(build(field), quality))
    }

    // Passes the operand through unchanged; warns when it is absent
    def required[T](operand: Option[T], operandName: String): Option[T] = {
      if (operand.isEmpty) warnSkipped(s"missing or invalid $operandName")
      operand
    }

    // Locale.ROOT keeps rule matching stable regardless of the JVM default locale
    quality.rule.map(_.toLowerCase(java.util.Locale.ROOT)) match {
      case Some("nullcheck") | Some("null_check") =>
        onField(field => ValidationBuilder().field(field).isNull(negate = true))

      case Some("uniquecheck") | Some("unique_check") =>
        onField(field => ValidationBuilder().unique(field))

      case Some("countcheck") | Some("count_check") =>
        // Row count validation using groupBy and count
        val validation = ValidationBuilder().groupBy().count().expr(countExpression(quality))
        Some(addDescriptionAndThreshold(validation, quality))

      case Some(rule) if rule.contains("between") =>
        // Generic between check, only valid with exactly two bounds
        required(quality.mustBeBetween.filter(_.length == 2), "mustBeBetween (requires exactly 2 values)")
          .flatMap(range => onField(field => ValidationBuilder().field(field).between(range(0), range(1))))

      case Some(rule) if rule.contains("greaterthan") || rule.contains("greater_than") =>
        required(quality.mustBeGreaterThan, "mustBeGreaterThan")
          .flatMap(value => onField(field => ValidationBuilder().field(field).greaterThan(value)))

      case Some(rule) if rule.contains("lessthan") || rule.contains("less_than") =>
        required(quality.mustBeLessThan, "mustBeLessThan")
          .flatMap(value => onField(field => ValidationBuilder().field(field).lessThan(value)))

      case Some(rule) if rule.contains("inset") || rule.contains("in_set") =>
        quality.mustBe match {
          case Some(values: Iterable[_]) =>
            onField(field => ValidationBuilder().field(field).in(values.map(_.toString).toSeq: _*))
          case _ =>
            warnSkipped("mustBe is missing or is not a collection of values")
            None
        }

      case Some(rule) if rule.contains("matchespattern") || rule.contains("matches_pattern") =>
        required(quality.mustBe, "mustBe")
          .flatMap(pattern => onField(field => ValidationBuilder().field(field).matches(pattern.toString)))

      case _ =>
        LOGGER.warn(s"Unknown ODCS library quality check rule: ${quality.rule.getOrElse("none")}, dimension=${quality.dimension.getOrElse("none")}")
        None
    }
  }

  /**
   * Build the SQL expression applied to the `count` column of a row count check.
   *
   * The first operand present wins, in this order: `mustBeBetween` (only when it holds exactly
   * two values), `mustBe`, `mustBeGreaterThan`, `mustBeGreaterOrEqualTo`, `mustBeLessThan`,
   * `mustBeLessOrEqualTo`. With no operand the expression is `count >= 0`.
   *
   * @param quality The quality rule holding the comparison operands
   * @return The expression over `count`
   */
  private def countExpression(quality: OpenDataContractStandardDataQualityV3): String = {
    quality.mustBeBetween
      .filter(_.length == 2)
      .map(range => s"count >= ${range(0)} AND count <= ${range(1)}")
      .orElse(quality.mustBe.map(value => s"count == $value"))
      .orElse(quality.mustBeGreaterThan.map(value => s"count > $value"))
      .orElse(quality.mustBeGreaterOrEqualTo.map(value => s"count >= $value"))
      .orElse(quality.mustBeLessThan.map(value => s"count < $value"))
      .orElse(quality.mustBeLessOrEqualTo.map(value => s"count <= $value"))
      .getOrElse("count >= 0")
  }

  /**
   * Convert a `sql` quality rule by using its query as the validation expression.
   *
   * @param quality     The quality rule to convert
   * @param optProperty Not used; kept so all converters share the same signature
   * @return The validation, or None when the rule has no query
   */
  private def convertSqlQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    quality.query.map(sql => addDescriptionAndThreshold(ValidationBuilder().expr(sql), quality))
  }

  /**
   * Convert a `custom` quality rule.
   *
   * For custom checks, the `implementation` field (or `code` when there is no implementation)
   * is used as-is as a SQL expression; no check is made that it actually is valid SQL.
   *
   * @param quality     The quality rule to convert
   * @param optProperty Not used; kept so all converters share the same signature
   * @return The validation, or None when neither implementation nor code is defined
   */
  private def convertCustomQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    quality.implementation
      .orElse(quality.code)
      .map(expr => addDescriptionAndThreshold(ValidationBuilder().expr(expr), quality))
  }

  /**
   * Apply the rule's description and severity-based error threshold to a validation.
   *
   * The description falls back to the rule name. Severities "warning" and "info" (compared
   * case-insensitively) get a lenient error threshold; any other or missing severity keeps
   * the strict default behaviour of the validation.
   *
   * @param validation The validation to decorate
   * @param quality    The quality rule supplying description, name and severity
   * @return The decorated validation
   */
  private def addDescriptionAndThreshold(
    validation: ValidationBuilder,
    quality: OpenDataContractStandardDataQualityV3
  ): ValidationBuilder = {
    val described = quality.description
      .orElse(quality.name)
      .fold(validation)(desc => validation.description(desc))

    // Normalise so that "Warning"/"INFO" are handled the same as their lower-case forms
    val isLenient = quality.severity
      .map(_.toLowerCase(java.util.Locale.ROOT))
      .exists(LENIENT_SEVERITIES.contains)

    if (isLenient) described.errorThreshold(LENIENT_ERROR_THRESHOLD) else described
  }
}
0 commit comments