Skip to content

Commit 0b5fd84

Browse files
authored
Merge pull request #109 from data-catering/feature/single-file-odcs
Update version to 0.16.11, enhance Open Data Contract Standard (ODCS)…
2 parents 6b91fe7 + eec01af commit 0b5fd84

17 files changed

Lines changed: 1549 additions & 45 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ and deep dive into issues [from the generated report](https://data.catering/late
3838

3939
1. Docker
4040
```shell
41-
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.16.10
41+
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.16.11
4242
```
4343
[Open localhost:9898](http://localhost:9898).
4444
1. [Run Scala/Java examples](#run-scalajava-examples)

app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardDataSourceMetadata.scala

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@ case class OpenDataContractStandardDataSourceMetadata(
2424
}
2525

2626
/**
 * Build validations from the quality rules declared in the configured ODCS data contract file.
 *
 * Returns an empty list when no contract file is configured or when the file cannot be read
 * as an ODCS v3 contract.
 *
 * @param dataSourceReadOptions read options of the data source (not consulted here)
 * @return validations gathered from every schema in the contract
 */
override def getDataSourceValidations(dataSourceReadOptions: Map[String, String]): List[ValidationBuilder] = {
  connectionConfig.get(DATA_CONTRACT_FILE).toList.flatMap(dataContractPath => {
    val contractFile = new File(dataContractPath)
    val parsedContract = Try(ObjectMapperUtil.yamlObjectMapper.readValue(contractFile, classOf[OpenDataContractStandardV3]))
    parsedContract match {
      case Failure(_) =>
        // Not a v3 contract, skip validations (v2 doesn't have quality checks)
        List()
      case Success(contract) =>
        // Gather the validations of every schema into one flat list
        contract.schema
          .map(schemas => schemas.flatMap(schema => OpenDataContractStandardDataValidations.getDataValidations(schema)).toList)
          .getOrElse(List())
    }
  })
}
3046

3147
override def getSubDataSourcesMetadata(implicit sparkSession: SparkSession): Array[SubDataSourceMetadata] = {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard
2+
3+
import io.github.datacatering.datacaterer.api.ValidationBuilder
4+
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{DataQualityTypeEnum, OpenDataContractStandardDataQualityV3, OpenDataContractStandardElementV3, OpenDataContractStandardSchemaV3}
5+
import org.apache.log4j.Logger
6+
7+
object OpenDataContractStandardDataValidations {

  private val LOGGER = Logger.getLogger(getClass.getName)

  /**
   * Convert ODCS data quality rules to Data Caterer validations.
   *
   * Schema-level rules come first in the result, followed by the rules attached to each property.
   * Rules that cannot be converted are dropped.
   *
   * @param schema The ODCS schema containing quality rules
   * @return List of validation builders
   */
  def getDataValidations(schema: OpenDataContractStandardSchemaV3): List[ValidationBuilder] = {
    val schemaLevelValidations = schema.quality
      .map(_.toList)
      .getOrElse(List())
      .flatMap(quality => convertQualityCheck(quality, None))
    val propertyLevelValidations = schema.properties
      .map(_.toList)
      .getOrElse(List())
      .flatMap(property => {
        property.quality
          .map(_.toList)
          .getOrElse(List())
          .flatMap(quality => convertQualityCheck(quality, Some(property)))
      })

    val allValidations = schemaLevelValidations ++ propertyLevelValidations
    if (allValidations.nonEmpty) {
      LOGGER.info(s"Converted ODCS quality checks to validations, schema=${schema.name}, num-validations=${allValidations.size}")
    }
    allValidations
  }

  /**
   * Dispatch a single quality rule to the converter for its type.
   *
   * @param quality     the quality rule to convert
   * @param optProperty the property the rule is attached to, or None for a schema-level rule
   * @return the validation, or None when the rule is documentation only or cannot be converted
   */
  private def convertQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    // ODCS lists `library` as the default rule type. A rule that omits `type` may be
    // deserialized as null, which would otherwise fail the match below with a MatchError.
    val qualityType = Option(quality.`type`).getOrElse(DataQualityTypeEnum.library)
    qualityType match {
      case DataQualityTypeEnum.library => convertLibraryQualityCheck(quality, optProperty)
      case DataQualityTypeEnum.sql => convertSqlQualityCheck(quality, optProperty)
      case DataQualityTypeEnum.text => None // Text is just documentation
      case DataQualityTypeEnum.custom => convertCustomQualityCheck(quality, optProperty)
    }
  }

  /**
   * Convert a `library` rule, selected by its lower-cased rule name.
   *
   * Field-based rules yield None when neither the rule nor the property supplies a field name,
   * or when the operand the rule needs (e.g. `mustBeBetween`) is absent.
   */
  private def convertLibraryQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    // Get the field name from quality.field or optProperty
    val fieldName = quality.field.orElse(optProperty.map(_.name))

    // Build a field-based validation, then attach description and threshold
    def onField(build: String => ValidationBuilder): Option[ValidationBuilder] =
      fieldName.map(field => addDescriptionAndThreshold(build(field), quality))

    quality.rule.map(_.toLowerCase) match {
      case Some("nullcheck") | Some("null_check") =>
        onField(field => ValidationBuilder().field(field).isNull(negate = true))

      case Some("uniquecheck") | Some("unique_check") =>
        onField(field => ValidationBuilder().unique(field))

      case Some("countcheck") | Some("count_check") =>
        // Row count validation using groupBy and count.
        // The first operand present wins, in the order listed below.
        val expr = quality.mustBeBetween
          .filter(_.length == 2)
          .map(range => s"count >= ${range(0)} AND count <= ${range(1)}")
          .orElse(quality.mustBe.map(value => s"count == $value"))
          .orElse(quality.mustBeGreaterThan.map(value => s"count > $value"))
          .orElse(quality.mustBeGreaterOrEqualTo.map(value => s"count >= $value"))
          .orElse(quality.mustBeLessThan.map(value => s"count < $value"))
          .orElse(quality.mustBeLessOrEqualTo.map(value => s"count <= $value"))
          .getOrElse("count >= 0")
        val validation = ValidationBuilder().groupBy().count().expr(expr)
        Some(addDescriptionAndThreshold(validation, quality))

      case Some(rule) if rule.contains("between") =>
        // Generic between check, only when exactly two bounds are given
        quality.mustBeBetween
          .filter(_.length == 2)
          .flatMap(range => onField(field => ValidationBuilder().field(field).between(range(0), range(1))))

      case Some(rule) if rule.contains("greaterthan") || rule.contains("greater_than") =>
        quality.mustBeGreaterThan
          .flatMap(value => onField(field => ValidationBuilder().field(field).greaterThan(value)))

      case Some(rule) if rule.contains("lessthan") || rule.contains("less_than") =>
        quality.mustBeLessThan
          .flatMap(value => onField(field => ValidationBuilder().field(field).lessThan(value)))

      case Some(rule) if rule.contains("inset") || rule.contains("in_set") =>
        quality.mustBe match {
          case Some(values: Iterable[_]) =>
            onField(field => ValidationBuilder().field(field).in(values.map(_.toString).toSeq: _*))
          case _ => None
        }

      case Some(rule) if rule.contains("matchespattern") || rule.contains("matches_pattern") =>
        quality.mustBe
          .flatMap(pattern => onField(field => ValidationBuilder().field(field).matches(pattern.toString)))

      case _ =>
        LOGGER.warn(s"Unknown ODCS library quality check rule: ${quality.rule.getOrElse("none")}, dimension=${quality.dimension.getOrElse("none")}")
        None
    }
  }

  /**
   * Convert a `sql` rule by using its query as the validation expression.
   *
   * `optProperty` is not used; it is kept so every converter shares the same signature.
   */
  private def convertSqlQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    quality.query.map(sql => addDescriptionAndThreshold(ValidationBuilder().expr(sql), quality))
  }

  /**
   * Convert a `custom` rule by using its implementation, or failing that its code, as the expression.
   *
   * `optProperty` is not used; it is kept so every converter shares the same signature.
   */
  private def convertCustomQualityCheck(
    quality: OpenDataContractStandardDataQualityV3,
    optProperty: Option[OpenDataContractStandardElementV3]
  ): Option[ValidationBuilder] = {
    // For custom checks, we can try to use the implementation or code fields as SQL expressions
    quality.implementation
      .orElse(quality.code)
      .map(expr => addDescriptionAndThreshold(ValidationBuilder().expr(expr), quality))
  }

  /**
   * Attach the rule's description (falling back to its name) and a severity-based error threshold.
   *
   * @param validation the validation to decorate
   * @param quality    the quality rule supplying description, name and severity
   * @return the decorated validation
   */
  private def addDescriptionAndThreshold(
    validation: ValidationBuilder,
    quality: OpenDataContractStandardDataQualityV3
  ): ValidationBuilder = {
    val described = quality.description
      .orElse(quality.name)
      .fold(validation)(desc => validation.description(desc))

    // For "error" severity (or none), keep strict validation with no threshold.
    // For "warning" and "info", allow up to 5% failures.
    quality.severity match {
      case Some("warning") | Some("info") => described.errorThreshold(0.05)
      case _ => described
    }
  }
}

app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardV3Mapper.scala

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard
22

3-
import io.github.datacatering.datacaterer.api.model.Constants.{ENABLED_NULL, FIELD_DATA_TYPE, IS_NULLABLE, IS_PRIMARY_KEY, IS_UNIQUE, METADATA_IDENTIFIER, PRIMARY_KEY_POSITION}
3+
import io.github.datacatering.datacaterer.api.model.Constants.{ENABLED_NULL, FIELD_DATA_TYPE, FORMAT, IS_NULLABLE, IS_PRIMARY_KEY, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, METADATA_IDENTIFIER, MINIMUM, MINIMUM_LENGTH, PRIMARY_KEY_POSITION, REGEX_GENERATOR}
44
import io.github.datacatering.datacaterer.api.model.{ArrayType, BooleanType, DataType, DateType, DoubleType, IntegerType, StringType, StructType}
55
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.SubDataSourceMetadata
66
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.database.FieldMetadata
7-
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{LogicalTypeEnum, OpenDataContractStandardElementV3, OpenDataContractStandardSchemaV3, OpenDataContractStandardV3}
7+
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{LogicalTypeEnum, OpenDataContractStandardElementV3, OpenDataContractStandardLogicalTypeOptionsV3, OpenDataContractStandardSchemaV3, OpenDataContractStandardV3}
88
import org.apache.log4j.Logger
99
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
1010

1111
object OpenDataContractStandardV3Mapper {
1212

1313
private val LOGGER = Logger.getLogger(getClass.getName)
14+
private val ODCS_CLASSIFICATION = "odcsClassification"
15+
private val ODCS_EXAMPLES = "odcsExamples"
1416

1517
implicit val fieldMetadataEncoder: Encoder[FieldMetadata] = Encoders.kryo[FieldMetadata]
1618

@@ -51,14 +53,55 @@ object OpenDataContractStandardV3Mapper {
5153
}
5254

5355
/**
 * Build the field metadata for a property: data type, nullability, key and uniqueness flags,
 * plus any logical type options, examples and classification the property declares.
 */
private def getBasePropertyMetadata(property: OpenDataContractStandardElementV3, dataType: DataType): Map[String, String] = {
  // required=true means NOT nullable; required=false (or absent) means nullable
  val nullableFlag = (!property.required.getOrElse(false)).toString
  val coreEntries = Map(
    FIELD_DATA_TYPE -> dataType.toString(),
    IS_NULLABLE -> nullableFlag,
    ENABLED_NULL -> nullableFlag,
    IS_PRIMARY_KEY -> property.primaryKey.getOrElse(false).toString,
    PRIMARY_KEY_POSITION -> property.primaryKeyPosition.map(_.toString).getOrElse("-1"),
    IS_UNIQUE -> property.unique.getOrElse(false).toString
  )

  // Logical type options feed data generation
  val logicalTypeEntries = property.logicalTypeOptions
    .map(getLogicalTypeOptionsMetadata)
    .getOrElse(Map.empty[String, String])

  // Examples are kept for documentation/reference only, not for generation
  val exampleEntries = property.examples
    .filter(_.nonEmpty)
    .map(examples => Map(ODCS_EXAMPLES -> examples.map(_.toString).mkString(",")))
    .getOrElse(Map.empty[String, String])

  // Classification, when present
  val classificationEntries = property.classification
    .map(classification => Map(ODCS_CLASSIFICATION -> classification))
    .getOrElse(Map.empty[String, String])

  coreEntries ++ logicalTypeEntries ++ exampleEntries ++ classificationEntries
}
86+
87+
/**
 * Translate ODCS logical type options into generator metadata entries.
 * Only the options that are present contribute an entry.
 */
private def getLogicalTypeOptionsMetadata(options: OpenDataContractStandardLogicalTypeOptionsV3): Map[String, String] = {
  // Note: multipleOf is not supported by Data Caterer's generation system
  // Note: Array constraints (minItems, maxItems, uniqueItems) and object constraints
  // are not currently used in Data Caterer's field-level generation
  List(
    // String constraints
    options.minLength.map(v => MINIMUM_LENGTH -> v.toString),
    options.maxLength.map(v => MAXIMUM_LENGTH -> v.toString),
    options.pattern.map(v => REGEX_GENERATOR -> v),
    options.format.map(v => FORMAT -> v),
    // Numeric constraints (min/max supported by Data Caterer)
    options.minimum.map(v => MINIMUM -> v.toString),
    options.maximum.map(v => MAXIMUM -> v.toString)
  ).flatten.toMap
}
63106

64107
private def getDataType(element: OpenDataContractStandardElementV3): DataType = {

app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardV3Models.scala

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ case class OpenDataContractStandardSchemaV3(
101101
properties: Option[Array[OpenDataContractStandardElementV3]] = None,
102102
priorTableName: Option[String] = None,
103103
quality: Option[Array[OpenDataContractStandardDataQualityV3]] = None,
104+
relationships: Option[Array[OpenDataContractStandardRelationshipSchemaLevelV3]] = None,
104105
tags: Option[Array[String]] = None,
105106
)
106107

@@ -124,6 +125,7 @@ case class OpenDataContractStandardElementV3(
124125
primaryKey: Option[Boolean] = None,
125126
primaryKeyPosition: Option[Int] = None,
126127
quality: Option[Array[OpenDataContractStandardDataQualityV3]] = None,
128+
relationships: Option[Array[OpenDataContractStandardRelationshipV3]] = None,
127129
required: Option[Boolean] = None,
128130
tags: Option[Array[String]] = None,
129131
transformDescription: Option[String] = None,
@@ -217,3 +219,31 @@ case class OpenDataContractStandardTeam(
217219
replacedByUsername: Option[String] = None,
218220
role: Option[String] = None,
219221
)
222+
223+
/**
 * Property-level relationship: a link from the property that declares it to another property.
 *
 * @param to          path of the target property (e.g., "schema.table.column")
 * @param `type`      kind of relationship (e.g., "foreignKey", "references")
 * @param description free-text description of the relationship
 */
@JsonIgnoreProperties(ignoreUnknown = true)
case class OpenDataContractStandardRelationshipV3(
  to: String,
  `type`: Option[String] = None,
  description: Option[String] = None,
)
235+
236+
/**
 * Schema-level relationship: a link between two properties, declared on the schema.
 *
 * @param from        path of the source property
 * @param to          path of the target property
 * @param `type`      kind of relationship (e.g., "foreignKey", "references")
 * @param description free-text description of the relationship
 */
@JsonIgnoreProperties(ignoreUnknown = true)
case class OpenDataContractStandardRelationshipSchemaLevelV3(
  from: String,
  to: String,
  `type`: Option[String] = None,
  description: Option[String] = None,
)

0 commit comments

Comments
 (0)