Skip to content

Commit 81d19fc

Browse files
committed
Merge branch 'main' of github.com:data-catering/data-caterer
2 parents 423dbc0 + b6ccde2 commit 81d19fc

365 files changed

Lines changed: 37846 additions & 134 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/benchmark.yml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
name: Run performance benchmark tests
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
8+
jobs:
9+
build:
10+
runs-on: ubuntu-latest
11+
defaults:
12+
run:
13+
working-directory: example
14+
steps:
15+
- name: Checkout monorepo
16+
uses: actions/checkout@v4
17+
with:
18+
fetch-depth: 2
19+
- name: Check if benchmark has already run
20+
run: |
21+
version=$(grep dataCatererVersion gradle.properties | cut -d= -f2)
22+
if [ ! -f benchmark/results/benchmark_results_${version}.txt ]; then
23+
echo "No benchmark results for version: $version, starting to run benchmarks"
24+
else
25+
echo "Benchmarks already run!"
26+
exit 1
27+
fi
28+
- name: Checkout datafusion-comet repo
29+
uses: actions/checkout@v4
30+
with:
31+
fetch-depth: 2
32+
repository: apache/datafusion-comet
33+
path: example/benchmark/build/datafusion-comet
34+
- name: Get Spark query engine jars
35+
run: bash benchmark/setup_query_engine_jars.sh
36+
- name: Run benchmark script
37+
env:
38+
DATA_CATERER_MANAGEMENT_TRACK: ${{ secrets.DATA_CATERER_MANAGEMENT_TRACK }}
39+
run: |
40+
version=$(grep dataCatererVersion gradle.properties | cut -d= -f2)
41+
bash benchmark/run_benchmark.sh
42+
bash benchmark/compare_benchmark_results.sh "$version"
43+
- name: Create pull request
44+
uses: peter-evans/create-pull-request@v6
45+
with:
46+
title: Add benchmark results

.github/workflows/docs-ci.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
name: docs
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
8+
permissions:
9+
contents: write
10+
11+
jobs:
12+
deploy:
13+
runs-on: ubuntu-latest
14+
steps:
15+
- uses: actions/checkout@v4
16+
with:
17+
fetch-depth: 0
18+
- uses: actions/setup-python@v5
19+
with:
20+
python-version: '3.x'
21+
- name: Configure Git Credentials
22+
run: |
23+
git config user.name github-actions[bot]
24+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
25+
- name: Set cache key
26+
run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
27+
- uses: actions/cache@v4
28+
with:
29+
key: mkdocs-material-${{ env.cache_id }}
30+
path: .cache
31+
restore-keys: |
32+
mkdocs-material-
33+
- run: sudo apt-get update && sudo apt-get install -y pngquant
34+
- run: pip install mkdocs-material mkdocs-open-in-new-tab "mkdocs-material[imaging]" mike
35+
- name: Deploy docs with mike
36+
run: |
37+
latest_tag=$(git describe --tags --abbrev=0)
38+
mike deploy --push --update-aliases "$latest_tag" latest
39+
env:
40+
GH_TOKEN: ${{ secrets.GH_TOKEN }}

.gitignore

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ tmp
1515
.bloop
1616
.metals
1717
.vscode
18+
site
19+
.cache
20+
21+
# Python/virtualenvs used by docs
22+
.venv
23+
.python-version
1824

1925
app/out
2026
app/src/test/resources/sample/parquet
@@ -30,8 +36,21 @@ app/src/test/resources/sample/plan-gen
3036
api/out
3137
api/src/test/resources/sample/documentation
3238

39+
# UI
3340
ui/node_modules
3441
ui/coverage
3542

43+
# Docs build caches
44+
.docs
45+
docs/site
46+
docs/.cache
47+
docs/.venv
48+
49+
# Example module
50+
example/build
51+
example/.gradle
52+
example/benchmark/build
53+
example/benchmark/jars
54+
3655
*.class
3756
*.log

api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -966,6 +966,14 @@ case class FieldBuilder(field: Field = Field()) {
966966
def omit(omit: Boolean): FieldBuilder =
967967
this.modify(_.field.options).setTo(getGenBuilder.omit(omit).options)
968968

969+
/**
970+
* Marks this field such that if it is the sole top-level field and is an array in a JSON task, the sink should
971+
* output a bare JSON array (no enclosing object with the field name).
972+
* Has no effect for non-JSON sinks or non-top-level contexts.
973+
*/
974+
def unwrapTopLevelArray(enable: Boolean): FieldBuilder =
975+
this.modify(_.field.options).setTo(getGenBuilder.unwrapTopLevel(enable).options)
976+
969977
/**
970978
* Sets the primary key flag for the current field.
971979
*
@@ -1536,6 +1544,12 @@ case class GeneratorBuilder(options: Map[String, Any] = Map()) {
15361544
def omit(omit: Boolean): GeneratorBuilder =
15371545
this.modify(_.options)(_ ++ Map(OMIT -> omit.toString))
15381546

1547+
/**
1548+
* Instruct JSON sink to unwrap the top-level field if it is a single array field.
1549+
*/
1550+
def unwrapTopLevel(enable: Boolean): GeneratorBuilder =
1551+
this.modify(_.options)(_ ++ Map(UNWRAP_TOP_LEVEL -> enable.toString))
1552+
15391553
/**
15401554
* Field is a primary key of the data source.
15411555
*

api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -996,9 +996,25 @@ case class FieldValidationBuilder(validationBuilder: ValidationBuilder = Validat
996996
private def removeTicksField: String = field.replaceAll("`", "")
997997

998998
private def seqToString(seq: Seq[Any]): String = {
999-
seq.head match {
1000-
case _: String => seq.mkString("'", "','", "'")
1001-
case _ => seq.mkString(",")
999+
// If all values are numeric or numeric-looking strings, render as unquoted numbers to form ARRAY<DOUBLE/NUMERIC>
1000+
val allNumericLike = seq.forall {
1001+
case s: String => Try(s.trim.toDouble).isSuccess
1002+
case n: java.lang.Number => true
1003+
case n: Number => true
1004+
case _ => false
1005+
}
1006+
if (allNumericLike) {
1007+
seq.map {
1008+
case s: String => BigDecimal(s.trim).toString()
1009+
case n: java.lang.Number => n.toString
1010+
case n: Number => n.toString
1011+
case other => other.toString
1012+
}.mkString(",")
1013+
} else {
1014+
seq.head match {
1015+
case _: String => seq.mkString("'", "','", "'")
1016+
case _ => seq.mkString(",")
1017+
}
10021018
}
10031019
}
10041020
}

api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ object Constants {
6161
lazy val DATA_CONTRACT_FILE = "dataContractFile"
6262
lazy val DATA_CONTRACT_SCHEMA = "dataContractSchema"
6363
lazy val ROWS_PER_SECOND = "rowsPerSecond"
64+
// When applied on a top-level array field in a JSON task, instructs the sink to output a bare JSON array
65+
// instead of an object { fieldName: [...] }.
66+
lazy val UNWRAP_TOP_LEVEL = "unwrapTopLevel"
6467
lazy val HUDI_TABLE_NAME = "hoodie.table.name"
6568
lazy val ICEBERG_CATALOG_TYPE = "catalogType"
6669
lazy val ICEBERG_CATALOG_URI = "catalogUri"

app/src/main/scala/io/github/datacatering/datacaterer/core/sink/SinkFactory.scala

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package io.github.datacatering.datacaterer.core.sink
22

33
import com.google.common.util.concurrent.RateLimiter
4-
import io.github.datacatering.datacaterer.api.model.Constants.{DELTA, DELTA_LAKE_SPARK_CONF, DRIVER, FORMAT, ICEBERG, ICEBERG_SPARK_CONF, JDBC, OMIT, PARTITIONS, PARTITION_BY, POSTGRES_DRIVER, RATE, ROWS_PER_SECOND, SAVE_MODE, TABLE}
4+
import io.github.datacatering.datacaterer.api.model.Constants.{DELTA, DELTA_LAKE_SPARK_CONF, DRIVER, FORMAT, ICEBERG, ICEBERG_SPARK_CONF, JDBC, JSON, OMIT, PARTITIONS, PARTITION_BY, PATH, POSTGRES_DRIVER, RATE, ROWS_PER_SECOND, SAVE_MODE, TABLE, UNWRAP_TOP_LEVEL}
55
import io.github.datacatering.datacaterer.api.model.{FlagsConfig, FoldersConfig, MetadataConfig, SinkResult, Step}
66
import io.github.datacatering.datacaterer.api.util.ConfigUtil
77
import io.github.datacatering.datacaterer.core.exception.{FailedSaveDataDataFrameV2Exception, FailedSaveDataException}
@@ -91,6 +91,10 @@ class SinkFactory(
9191
.foreach(conf => df.sqlContext.setConf(conf._1, conf._2))
9292
val trySaveData = if (format == ICEBERG) {
9393
Try(tryPartitionAndSaveDfV2(df, saveMode, connectionConfig))
94+
} else if (format == JSON) {
95+
// Special-case: allow unwrapping top-level array to emit a bare JSON array file
96+
val tryMaybeUnwrap = Try(trySaveJsonPossiblyUnwrapped(df, saveMode, connectionConfig))
97+
tryMaybeUnwrap
9498
} else {
9599
val partitionedDf = partitionDf(df, connectionConfig)
96100
Try(partitionedDf
@@ -112,6 +116,36 @@ class SinkFactory(
112116
mapToSinkResult(dataSourceName, df, saveMode, connectionConfig, count, format, trySaveData.isSuccess, startTime, optException)
113117
}
114118

119+
private def trySaveJsonPossiblyUnwrapped(df: DataFrame, saveMode: SaveMode, connectionConfig: Map[String, String]): Unit = {
120+
val shouldUnwrap = detectTopLevelArrayToUnwrap(df)
121+
shouldUnwrap match {
122+
case Some(arrayFieldName) =>
123+
// Write a single file containing the JSON array string using Spark text writer
124+
// We keep directory semantics consistent with other sinks
125+
val path = connectionConfig.getOrElse(PATH, throw new IllegalArgumentException("Missing path for JSON sink"))
126+
val jsonArrayDf = df.selectExpr(s"TO_JSON(`" + arrayFieldName + "`) AS value").coalesce(1)
127+
jsonArrayDf.write.mode(saveMode).text(path)
128+
case None =>
129+
// Default JSON behavior
130+
val partitionedDf = partitionDf(df, connectionConfig)
131+
partitionedDf
132+
.format(JSON)
133+
.mode(saveMode)
134+
.options(connectionConfig)
135+
.save()
136+
}
137+
}
138+
139+
private def detectTopLevelArrayToUnwrap(df: DataFrame): Option[String] = {
140+
val fields = df.schema.fields
141+
if (fields.length == 1) {
142+
val f = fields.head
143+
val hasFlag = f.metadata.contains(UNWRAP_TOP_LEVEL) && f.metadata.getString(UNWRAP_TOP_LEVEL).equalsIgnoreCase("true")
144+
val isArray = f.dataType.typeName == "array"
145+
if (hasFlag && isArray) Some(f.name) else None
146+
} else None
147+
}
148+
115149
private def partitionDf(df: DataFrame, stepOptions: Map[String, String]): DataFrameWriter[Row] = {
116150
val partitionDf = stepOptions.get(PARTITIONS)
117151
.map(partitionNum => df.repartition(partitionNum.toInt)).getOrElse(df)

0 commit comments

Comments
 (0)