Skip to content

Commit 8f282be

Browse files
committed
Update version to 0.16.11, enhance Open Data Contract Standard (ODCS) support with data validations and constraints, and implement single file output consolidation for file data sources. Add new metadata extraction features and improve handling of CSV headers during consolidation.
1 parent 6b91fe7 commit 8f282be

16 files changed

Lines changed: 1449 additions & 33 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ and deep dive into issues [from the generated report](https://data.catering/late
3838

3939
1. Docker
4040
```shell
41-
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.16.10
41+
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.16.11
4242
```
4343
[Open localhost:9898](http://localhost:9898).
4444
1. [Run Scala/Java examples](#run-scalajava-examples)

app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardDataSourceMetadata.scala

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,24 @@ case class OpenDataContractStandardDataSourceMetadata(
2424
}
2525

2626
override def getDataSourceValidations(dataSourceReadOptions: Map[String, String]): List[ValidationBuilder] = {
  // Validations are only produced when a data contract file is configured and it parses as an ODCS v3 contract.
  // Any parse failure (unreadable file, malformed YAML, non-v3 contract) is treated the same way: no validations.
  val optContract = connectionConfig.get(DATA_CONTRACT_FILE).flatMap { dataContractPath =>
    val contractFile = new File(dataContractPath)
    Try(ObjectMapperUtil.yamlObjectMapper.readValue(contractFile, classOf[OpenDataContractStandardV3])).toOption
  }

  // Collect the validations of every schema declared in the contract
  val optValidations = for {
    contract <- optContract
    schemas <- contract.schema
  } yield schemas.flatMap(schema => OpenDataContractStandardDataValidations.getDataValidations(schema)).toList

  optValidations.getOrElse(List())
}
3046

3147
override def getSubDataSourcesMetadata(implicit sparkSession: SparkSession): Array[SubDataSourceMetadata] = {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard
2+
3+
import io.github.datacatering.datacaterer.api.ValidationBuilder
4+
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{DataQualityTypeEnum, OpenDataContractStandardDataQualityV3, OpenDataContractStandardElementV3, OpenDataContractStandardSchemaV3}
5+
import org.apache.log4j.Logger
6+
7+
object OpenDataContractStandardDataValidations {

  private val LOGGER = Logger.getLogger(getClass.getName)

  /** Error threshold applied to checks with "warning" or "info" severity (allows up to 5% failures). */
  private val LENIENT_ERROR_THRESHOLD = 0.05

  /**
   * Convert ODCS data quality rules to Data Caterer validations.
   *
   * Schema-level quality checks are converted first, followed by the quality checks attached to each property.
   * Checks that cannot be converted (documentation-only, unknown rule, missing operand or field) are dropped.
   *
   * @param schema The ODCS schema containing quality rules
   * @return List of validation builders, empty when the schema declares no convertible quality checks
   */
  def getDataValidations(schema: OpenDataContractStandardSchemaV3): List[ValidationBuilder] = {
    val schemaLevelValidations = schema.quality.toList
      .flatMap(_.toList)
      .flatMap(quality => convertQualityCheck(quality, None).toList)

    val propertyLevelValidations = for {
      property <- schema.properties.toList.flatMap(_.toList)
      quality <- property.quality.toList.flatMap(_.toList)
      validation <- convertQualityCheck(quality, Some(property)).toList
    } yield validation

    val allValidations = schemaLevelValidations ++ propertyLevelValidations
    if (allValidations.nonEmpty) {
      LOGGER.info(s"Converted ODCS quality checks to validations, schema=${schema.name}, num-validations=${allValidations.size}")
    }
    allValidations
  }

  /**
   * Dispatch a single quality check to the converter for its type.
   *
   * @param quality     The ODCS quality check
   * @param optProperty The property the check is attached to, or None for schema-level checks
   * @return The converted validation, or None when the check cannot be converted
   */
  private def convertQualityCheck(
                                   quality: OpenDataContractStandardDataQualityV3,
                                   optProperty: Option[OpenDataContractStandardElementV3]
                                 ): Option[ValidationBuilder] = {
    quality.`type` match {
      case DataQualityTypeEnum.library => convertLibraryQualityCheck(quality, optProperty)
      case DataQualityTypeEnum.sql => convertSqlQualityCheck(quality, optProperty)
      case DataQualityTypeEnum.text => None // Text is just documentation
      case DataQualityTypeEnum.custom => convertCustomQualityCheck(quality, optProperty)
      case other =>
        // Guards against a null/unexpected type instead of failing the whole conversion with a MatchError
        LOGGER.warn(s"Unsupported ODCS quality check type, type=$other")
        None
    }
  }

  /**
   * Convert a "library" quality check by matching on its lower-cased rule name.
   *
   * The field under validation is taken from `quality.field`, falling back to the name of the enclosing property.
   * A check whose rule is recognised but whose field or operand is missing is skipped with a warning.
   */
  private def convertLibraryQualityCheck(
                                          quality: OpenDataContractStandardDataQualityV3,
                                          optProperty: Option[OpenDataContractStandardElementV3]
                                        ): Option[ValidationBuilder] = {
    // Get the field name from quality.field or optProperty
    val fieldName = quality.field.orElse(optProperty.map(_.name))

    // Logs why a recognised rule produced no validation, rather than dropping it silently
    def skip(reason: String): Option[ValidationBuilder] = {
      LOGGER.warn(s"Skipping ODCS library quality check, rule=${quality.rule.getOrElse("none")}, reason=$reason")
      None
    }

    // Builds a field-level validation, skipping with a warning when no field name can be resolved
    def forField(build: String => ValidationBuilder): Option[ValidationBuilder] = fieldName match {
      case Some(field) => Some(addDescriptionAndThreshold(build(field), quality))
      case None => skip("no field name available")
    }

    quality.rule.map(_.toLowerCase) match {
      case Some("nullcheck") | Some("null_check") =>
        forField(field => ValidationBuilder().field(field).isNull(negate = true))

      case Some("uniquecheck") | Some("unique_check") =>
        forField(field => ValidationBuilder().unique(field))

      case Some("countcheck") | Some("count_check") =>
        // Row count validation using groupBy and count; the first defined operand wins, in this order
        val expr = quality.mustBeBetween.filter(_.length == 2)
          .map(range => s"count >= ${range(0)} AND count <= ${range(1)}")
          .orElse(quality.mustBe.map(value => s"count == $value"))
          .orElse(quality.mustBeGreaterThan.map(value => s"count > $value"))
          .orElse(quality.mustBeGreaterOrEqualTo.map(value => s"count >= $value"))
          .orElse(quality.mustBeLessThan.map(value => s"count < $value"))
          .orElse(quality.mustBeLessOrEqualTo.map(value => s"count <= $value"))
          .getOrElse("count >= 0") // no operand given: only asserts that a count exists
        val validation = ValidationBuilder().groupBy().count().expr(expr)
        Some(addDescriptionAndThreshold(validation, quality))

      case Some(rule) if rule.contains("between") =>
        // Generic between check, requires exactly a lower and an upper bound
        quality.mustBeBetween.filter(_.length == 2) match {
          case Some(range) => forField(field => ValidationBuilder().field(field).between(range(0), range(1)))
          case None => skip("mustBeBetween requires exactly two values")
        }

      case Some(rule) if rule.contains("greaterthan") || rule.contains("greater_than") =>
        // "greaterThanOrEqual"-style rules also land here; fall back to the inclusive operand so they are not dropped
        quality.mustBeGreaterThan
          .map(value => forField(field => ValidationBuilder().field(field).greaterThan(value)))
          .orElse(quality.mustBeGreaterOrEqualTo
            .map(value => forField(field => ValidationBuilder().field(field).greaterThan(value, false)))) // strictly = false
          .getOrElse(skip("mustBeGreaterThan or mustBeGreaterOrEqualTo is required"))

      case Some(rule) if rule.contains("lessthan") || rule.contains("less_than") =>
        // "lessThanOrEqual"-style rules also land here; fall back to the inclusive operand so they are not dropped
        quality.mustBeLessThan
          .map(value => forField(field => ValidationBuilder().field(field).lessThan(value)))
          .orElse(quality.mustBeLessOrEqualTo
            .map(value => forField(field => ValidationBuilder().field(field).lessThan(value, false)))) // strictly = false
          .getOrElse(skip("mustBeLessThan or mustBeLessOrEqualTo is required"))

      case Some(rule) if rule.contains("inset") || rule.contains("in_set") =>
        quality.mustBe match {
          case Some(values: Iterable[_]) =>
            forField(field => ValidationBuilder().field(field).in(values.map(_.toString).toSeq: _*))
          case _ => skip("mustBe must be a collection of allowed values")
        }

      case Some(rule) if rule.contains("matchespattern") || rule.contains("matches_pattern") =>
        quality.mustBe match {
          case Some(pattern) => forField(field => ValidationBuilder().field(field).matches(pattern.toString))
          case None => skip("mustBe (the pattern) is required")
        }

      case _ =>
        LOGGER.warn(s"Unknown ODCS library quality check rule: ${quality.rule.getOrElse("none")}, dimension=${quality.dimension.getOrElse("none")}")
        None
    }
  }

  /**
   * Convert a "sql" quality check by passing its query to the validation as an expression.
   *
   * NOTE(review): the query is used as-is as a validation expression and the mustBe* operands are not applied;
   * confirm this matches how ODCS SQL checks are expected to be evaluated.
   */
  private def convertSqlQualityCheck(
                                      quality: OpenDataContractStandardDataQualityV3,
                                      optProperty: Option[OpenDataContractStandardElementV3]
                                    ): Option[ValidationBuilder] = {
    quality.query.map(sql => addDescriptionAndThreshold(ValidationBuilder().expr(sql), quality))
  }

  /**
   * Convert a "custom" quality check, using `implementation` (or else `code`) as a SQL expression.
   */
  private def convertCustomQualityCheck(
                                         quality: OpenDataContractStandardDataQualityV3,
                                         optProperty: Option[OpenDataContractStandardElementV3]
                                       ): Option[ValidationBuilder] = {
    quality.implementation
      .orElse(quality.code)
      .map(expr => addDescriptionAndThreshold(ValidationBuilder().expr(expr), quality))
  }

  /**
   * Attach the check's description (falling back to its name) and a severity-based error threshold.
   *
   * "warning" and "info" severities tolerate up to 5% failures; any other or missing severity keeps the
   * validation's default (strict) behaviour.
   */
  private def addDescriptionAndThreshold(
                                          validation: ValidationBuilder,
                                          quality: OpenDataContractStandardDataQualityV3
                                        ): ValidationBuilder = {
    val described = quality.description.orElse(quality.name)
      .fold(validation)(desc => validation.description(desc))

    quality.severity match {
      case Some("warning") | Some("info") => described.errorThreshold(LENIENT_ERROR_THRESHOLD)
      case _ => described
    }
  }
}

app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/OpenDataContractStandardV3Mapper.scala

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
package io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard
22

3-
import io.github.datacatering.datacaterer.api.model.Constants.{ENABLED_NULL, FIELD_DATA_TYPE, IS_NULLABLE, IS_PRIMARY_KEY, IS_UNIQUE, METADATA_IDENTIFIER, PRIMARY_KEY_POSITION}
3+
import io.github.datacatering.datacaterer.api.model.Constants.{ENABLED_NULL, FIELD_DATA_TYPE, FORMAT, IS_NULLABLE, IS_PRIMARY_KEY, IS_UNIQUE, MAXIMUM, MAXIMUM_LENGTH, METADATA_IDENTIFIER, MINIMUM, MINIMUM_LENGTH, PRIMARY_KEY_POSITION, REGEX_GENERATOR}
44
import io.github.datacatering.datacaterer.api.model.{ArrayType, BooleanType, DataType, DateType, DoubleType, IntegerType, StringType, StructType}
55
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.SubDataSourceMetadata
66
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.database.FieldMetadata
7-
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{LogicalTypeEnum, OpenDataContractStandardElementV3, OpenDataContractStandardSchemaV3, OpenDataContractStandardV3}
7+
import io.github.datacatering.datacaterer.core.generator.metadata.datasource.opendatacontractstandard.model.{LogicalTypeEnum, OpenDataContractStandardElementV3, OpenDataContractStandardLogicalTypeOptionsV3, OpenDataContractStandardSchemaV3, OpenDataContractStandardV3}
88
import org.apache.log4j.Logger
99
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
1010

1111
object OpenDataContractStandardV3Mapper {
1212

1313
private val LOGGER = Logger.getLogger(getClass.getName)
14+
private val ODCS_CLASSIFICATION = "odcsClassification"
15+
private val ODCS_EXAMPLES = "odcsExamples"
1416

1517
implicit val fieldMetadataEncoder: Encoder[FieldMetadata] = Encoders.kryo[FieldMetadata]
1618

@@ -51,14 +53,53 @@ object OpenDataContractStandardV3Mapper {
5153
}
5254

5355
/**
 * Build the field metadata for an ODCS property.
 *
 * Combines the base flags (data type, nullability, primary key, uniqueness) with any logical type options,
 * examples and classification declared on the property.
 *
 * @param property The ODCS property to describe
 * @param dataType The data type already resolved for the property
 * @return Metadata key/value pairs for the field
 */
private def getBasePropertyMetadata(property: OpenDataContractStandardElementV3, dataType: DataType): Map[String, String] = {
  // `required` means the value must be present, so nullability is its negation (previously passed through inverted)
  val isNullable = !property.required.getOrElse(false)

  val baseMetadata = Map(
    FIELD_DATA_TYPE -> dataType.toString(),
    IS_NULLABLE -> isNullable.toString,
    ENABLED_NULL -> isNullable.toString,
    IS_PRIMARY_KEY -> property.primaryKey.getOrElse(false).toString,
    PRIMARY_KEY_POSITION -> property.primaryKeyPosition.getOrElse(-1).toString,
    IS_UNIQUE -> property.unique.getOrElse(false).toString,
  )

  // Add logicalTypeOptions metadata for data generation
  val typeOptionsMetadata = property.logicalTypeOptions
    .map(getLogicalTypeOptionsMetadata)
    .getOrElse(Map.empty[String, String])

  // Add examples as metadata (for documentation/reference only, not for generation), joined with commas
  val examplesMetadata = property.examples
    .filter(_.nonEmpty)
    .map(examples => Map(ODCS_EXAMPLES -> examples.map(_.toString).mkString(",")))
    .getOrElse(Map.empty[String, String])

  // Add classification if present
  val classificationMetadata = property.classification
    .map(classification => Map(ODCS_CLASSIFICATION -> classification))
    .getOrElse(Map.empty[String, String])

  baseMetadata ++ typeOptionsMetadata ++ examplesMetadata ++ classificationMetadata
}
84+
85+
/**
 * Translate ODCS logical type options into field generation metadata.
 *
 * Only the options that are set produce an entry. multipleOf, array constraints (minItems, maxItems,
 * uniqueItems) and object constraints are not mapped.
 *
 * @param options The logical type options declared on a property
 * @return Metadata entries for the options that are defined
 */
private def getLogicalTypeOptionsMetadata(options: OpenDataContractStandardLogicalTypeOptionsV3): Map[String, String] = {
  // String constraints
  val stringConstraints = List(
    options.minLength.map(v => MINIMUM_LENGTH -> v.toString),
    options.maxLength.map(v => MAXIMUM_LENGTH -> v.toString),
    options.pattern.map(v => REGEX_GENERATOR -> v),
    options.format.map(v => FORMAT -> v)
  )

  // Numeric constraints (min/max supported by Data Caterer)
  val numericConstraints = List(
    options.minimum.map(v => MINIMUM -> v.toString),
    options.maximum.map(v => MAXIMUM -> v.toString)
  )

  // Keep only the defined options, in the same order they were previously added
  (stringConstraints ++ numericConstraints).flatten.toMap
}
63104

64105
private def getDataType(element: OpenDataContractStandardElementV3): DataType = {

app/src/main/scala/io/github/datacatering/datacaterer/core/generator/metadata/datasource/opendatacontractstandard/model/OpenDataContractStandardV3Models.scala

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ case class OpenDataContractStandardSchemaV3(
101101
properties: Option[Array[OpenDataContractStandardElementV3]] = None,
102102
priorTableName: Option[String] = None,
103103
quality: Option[Array[OpenDataContractStandardDataQualityV3]] = None,
104+
relationships: Option[Array[OpenDataContractStandardRelationshipSchemaLevelV3]] = None,
104105
tags: Option[Array[String]] = None,
105106
)
106107

@@ -124,6 +125,7 @@ case class OpenDataContractStandardElementV3(
124125
primaryKey: Option[Boolean] = None,
125126
primaryKeyPosition: Option[Int] = None,
126127
quality: Option[Array[OpenDataContractStandardDataQualityV3]] = None,
128+
relationships: Option[Array[OpenDataContractStandardRelationshipV3]] = None,
127129
required: Option[Boolean] = None,
128130
tags: Option[Array[String]] = None,
129131
transformDescription: Option[String] = None,
@@ -217,3 +219,31 @@ case class OpenDataContractStandardTeam(
217219
replacedByUsername: Option[String] = None,
218220
role: Option[String] = None,
219221
)
222+
223+
/**
224+
* Relationship at property level (foreign key from this property to another)
* Only `to` has no default; `type` and `description` default to None.
* Unknown properties are ignored on deserialization (ignoreUnknown = true).
225+
* @param to The target property path (e.g., "schema.table.column")
226+
* @param `type` Type of relationship (e.g., "foreignKey", "references")
227+
* @param description Description of the relationship
228+
*/
229+
@JsonIgnoreProperties(ignoreUnknown = true)
230+
case class OpenDataContractStandardRelationshipV3(
231+
to: String,
232+
`type`: Option[String] = None,
233+
description: Option[String] = None
234+
)
235+
236+
/**
237+
* Relationship at schema level (foreign key between properties)
* `from` and `to` have no defaults; `type` and `description` default to None.
* Unknown properties are ignored on deserialization (ignoreUnknown = true).
238+
* @param from The source property path
239+
* @param to The target property path
240+
* @param `type` Type of relationship (e.g., "foreignKey", "references")
241+
* @param description Description of the relationship
242+
*/
243+
@JsonIgnoreProperties(ignoreUnknown = true)
244+
case class OpenDataContractStandardRelationshipSchemaLevelV3(
245+
from: String,
246+
to: String,
247+
`type`: Option[String] = None,
248+
description: Option[String] = None
249+
)

0 commit comments

Comments
 (0)