Skip to content

Commit 81d19fc

Browse files
committed
Merge branch 'main' of github.com:data-catering/data-caterer
2 parents 423dbc0 + b6ccde2 commit 81d19fc

365 files changed

Lines changed: 37846 additions & 134 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/benchmark.yml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
name: Run performance benchmark tests
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
8+
jobs:
9+
build:
10+
runs-on: ubuntu-latest
11+
defaults:
12+
run:
13+
working-directory: example
14+
steps:
15+
- name: Checkout monorepo
16+
uses: actions/checkout@v4
17+
with:
18+
fetch-depth: 2
19+
- name: Check if benchmark has already run
20+
run: |
21+
version=$(grep dataCatererVersion gradle.properties | cut -d= -f2)
22+
if [ ! -f benchmark/results/benchmark_results_${version}.txt ]; then
23+
echo "No benchmark results for version: $version, starting to run benchmarks"
24+
else
25+
echo "Benchmarks already run!"
26+
exit 1
27+
fi
28+
- name: Checkout datafusion-comet repo
29+
uses: actions/checkout@v4
30+
with:
31+
fetch-depth: 2
32+
repository: apache/datafusion-comet
33+
path: example/benchmark/build/datafusion-comet
34+
- name: Get Spark query engine jars
35+
run: bash benchmark/setup_query_engine_jars.sh
36+
- name: Run benchmark script
37+
env:
38+
DATA_CATERER_MANAGEMENT_TRACK: ${{ secrets.DATA_CATERER_MANAGEMENT_TRACK }}
39+
run: |
40+
version=$(grep dataCatererVersion gradle.properties | cut -d= -f2)
41+
bash benchmark/run_benchmark.sh
42+
bash benchmark/compare_benchmark_results.sh "$version"
43+
- name: Create pull request
44+
uses: peter-evans/create-pull-request@v6
45+
with:
46+
title: Add benchmark results

.github/workflows/docs-ci.yml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
name: docs
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
8+
permissions:
9+
contents: write
10+
11+
jobs:
12+
deploy:
13+
runs-on: ubuntu-latest
14+
steps:
15+
- uses: actions/checkout@v4
16+
with:
17+
fetch-depth: 0
18+
- uses: actions/setup-python@v5
19+
with:
20+
python-version: '3.x'
21+
- name: Configure Git Credentials
22+
run: |
23+
git config user.name github-actions[bot]
24+
git config user.email 41898282+github-actions[bot]@users.noreply.github.com
25+
- name: Set cache key
26+
run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
27+
- uses: actions/cache@v4
28+
with:
29+
key: mkdocs-material-${{ env.cache_id }}
30+
path: .cache
31+
restore-keys: |
32+
mkdocs-material-
33+
- run: sudo apt-get update && sudo apt-get install -y pngquant
34+
- run: pip install mkdocs-material mkdocs-open-in-new-tab "mkdocs-material[imaging]" mike
35+
- name: Deploy docs with mike
36+
run: |
37+
latest_tag=$(git describe --tags --abbrev=0)
38+
mike deploy --push --update-aliases "$latest_tag" latest
39+
env:
40+
GH_TOKEN: ${{ secrets.GH_TOKEN }}

.gitignore

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ tmp
1515
.bloop
1616
.metals
1717
.vscode
18+
site
19+
.cache
20+
21+
# Python/virtualenvs used by docs
22+
.venv
23+
.python-version
1824

1925
app/out
2026
app/src/test/resources/sample/parquet
@@ -30,8 +36,21 @@ app/src/test/resources/sample/plan-gen
3036
api/out
3137
api/src/test/resources/sample/documentation
3238

39+
# UI
3340
ui/node_modules
3441
ui/coverage
3542

43+
# Docs build caches
44+
.docs
45+
docs/site
46+
docs/.cache
47+
docs/.venv
48+
49+
# Example module
50+
example/build
51+
example/.gradle
52+
example/benchmark/build
53+
example/benchmark/jars
54+
3655
*.class
3756
*.log

api/src/main/scala/io/github/datacatering/datacaterer/api/TaskBuilder.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -966,6 +966,14 @@ case class FieldBuilder(field: Field = Field()) {
966966
def omit(omit: Boolean): FieldBuilder =
967967
this.modify(_.field.options).setTo(getGenBuilder.omit(omit).options)
968968

969+
/**
970+
* Marks this field such that if it is the sole top-level field and is an array in a JSON task, the sink should
971+
* output a bare JSON array (no enclosing object with the field name).
972+
* Has no effect for non-JSON sinks or non-top-level contexts.
973+
*/
974+
def unwrapTopLevelArray(enable: Boolean): FieldBuilder =
975+
this.modify(_.field.options).setTo(getGenBuilder.unwrapTopLevel(enable).options)
976+
969977
/**
970978
* Sets the primary key flag for the current field.
971979
*
@@ -1536,6 +1544,12 @@ case class GeneratorBuilder(options: Map[String, Any] = Map()) {
15361544
def omit(omit: Boolean): GeneratorBuilder =
15371545
this.modify(_.options)(_ ++ Map(OMIT -> omit.toString))
15381546

1547+
/**
1548+
* Instruct JSON sink to unwrap the top-level field if it is a single array field.
1549+
*/
1550+
def unwrapTopLevel(enable: Boolean): GeneratorBuilder =
1551+
this.modify(_.options)(_ ++ Map(UNWRAP_TOP_LEVEL -> enable.toString))
1552+
15391553
/**
15401554
* Field is a primary key of the data source.
15411555
*

api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -996,9 +996,25 @@ case class FieldValidationBuilder(validationBuilder: ValidationBuilder = Validat
996996
private def removeTicksField: String = field.replaceAll("`", "")
997997

998998
private def seqToString(seq: Seq[Any]): String = {
999-
seq.head match {
1000-
case _: String => seq.mkString("'", "','", "'")
1001-
case _ => seq.mkString(",")
999+
// If all values are numeric or numeric-looking strings, render as unquoted numbers to form ARRAY<DOUBLE/NUMERIC>
1000+
val allNumericLike = seq.forall {
1001+
case s: String => Try(s.trim.toDouble).isSuccess
1002+
case n: java.lang.Number => true
1003+
case n: Number => true
1004+
case _ => false
1005+
}
1006+
if (allNumericLike) {
1007+
seq.map {
1008+
case s: String => BigDecimal(s.trim).toString()
1009+
case n: java.lang.Number => n.toString
1010+
case n: Number => n.toString
1011+
case other => other.toString
1012+
}.mkString(",")
1013+
} else {
1014+
seq.head match {
1015+
case _: String => seq.mkString("'", "','", "'")
1016+
case _ => seq.mkString(",")
1017+
}
10021018
}
10031019
}
10041020
}

api/src/main/scala/io/github/datacatering/datacaterer/api/model/Constants.scala

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ object Constants {
6161
lazy val DATA_CONTRACT_FILE = "dataContractFile"
6262
lazy val DATA_CONTRACT_SCHEMA = "dataContractSchema"
6363
lazy val ROWS_PER_SECOND = "rowsPerSecond"
64+
// When applied on a top-level array field in a JSON task, instructs the sink to output a bare JSON array
65+
// instead of an object { fieldName: [...] }.
66+
lazy val UNWRAP_TOP_LEVEL = "unwrapTopLevel"
6467
lazy val HUDI_TABLE_NAME = "hoodie.table.name"
6568
lazy val ICEBERG_CATALOG_TYPE = "catalogType"
6669
lazy val ICEBERG_CATALOG_URI = "catalogUri"

app/src/main/scala/io/github/datacatering/datacaterer/core/sink/SinkFactory.scala

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package io.github.datacatering.datacaterer.core.sink
22

33
import com.google.common.util.concurrent.RateLimiter
4-
import io.github.datacatering.datacaterer.api.model.Constants.{DELTA, DELTA_LAKE_SPARK_CONF, DRIVER, FORMAT, ICEBERG, ICEBERG_SPARK_CONF, JDBC, OMIT, PARTITIONS, PARTITION_BY, POSTGRES_DRIVER, RATE, ROWS_PER_SECOND, SAVE_MODE, TABLE}
4+
import io.github.datacatering.datacaterer.api.model.Constants.{DELTA, DELTA_LAKE_SPARK_CONF, DRIVER, FORMAT, ICEBERG, ICEBERG_SPARK_CONF, JDBC, JSON, OMIT, PARTITIONS, PARTITION_BY, PATH, POSTGRES_DRIVER, RATE, ROWS_PER_SECOND, SAVE_MODE, TABLE, UNWRAP_TOP_LEVEL}
55
import io.github.datacatering.datacaterer.api.model.{FlagsConfig, FoldersConfig, MetadataConfig, SinkResult, Step}
66
import io.github.datacatering.datacaterer.api.util.ConfigUtil
77
import io.github.datacatering.datacaterer.core.exception.{FailedSaveDataDataFrameV2Exception, FailedSaveDataException}
@@ -91,6 +91,10 @@ class SinkFactory(
9191
.foreach(conf => df.sqlContext.setConf(conf._1, conf._2))
9292
val trySaveData = if (format == ICEBERG) {
9393
Try(tryPartitionAndSaveDfV2(df, saveMode, connectionConfig))
94+
} else if (format == JSON) {
95+
// Special-case: allow unwrapping top-level array to emit a bare JSON array file
96+
val tryMaybeUnwrap = Try(trySaveJsonPossiblyUnwrapped(df, saveMode, connectionConfig))
97+
tryMaybeUnwrap
9498
} else {
9599
val partitionedDf = partitionDf(df, connectionConfig)
96100
Try(partitionedDf
@@ -112,6 +116,36 @@ class SinkFactory(
112116
mapToSinkResult(dataSourceName, df, saveMode, connectionConfig, count, format, trySaveData.isSuccess, startTime, optException)
113117
}
114118

119+
private def trySaveJsonPossiblyUnwrapped(df: DataFrame, saveMode: SaveMode, connectionConfig: Map[String, String]): Unit = {
120+
val shouldUnwrap = detectTopLevelArrayToUnwrap(df)
121+
shouldUnwrap match {
122+
case Some(arrayFieldName) =>
123+
// Write a single file containing the JSON array string using Spark text writer
124+
// We keep directory semantics consistent with other sinks
125+
val path = connectionConfig.getOrElse(PATH, throw new IllegalArgumentException("Missing path for JSON sink"))
126+
val jsonArrayDf = df.selectExpr(s"TO_JSON(`" + arrayFieldName + "`) AS value").coalesce(1)
127+
jsonArrayDf.write.mode(saveMode).text(path)
128+
case None =>
129+
// Default JSON behavior
130+
val partitionedDf = partitionDf(df, connectionConfig)
131+
partitionedDf
132+
.format(JSON)
133+
.mode(saveMode)
134+
.options(connectionConfig)
135+
.save()
136+
}
137+
}
138+
139+
private def detectTopLevelArrayToUnwrap(df: DataFrame): Option[String] = {
140+
val fields = df.schema.fields
141+
if (fields.length == 1) {
142+
val f = fields.head
143+
val hasFlag = f.metadata.contains(UNWRAP_TOP_LEVEL) && f.metadata.getString(UNWRAP_TOP_LEVEL).equalsIgnoreCase("true")
144+
val isArray = f.dataType.typeName == "array"
145+
if (hasFlag && isArray) Some(f.name) else None
146+
} else None
147+
}
148+
115149
private def partitionDf(df: DataFrame, stepOptions: Map[String, String]): DataFrameWriter[Row] = {
116150
val partitionDf = stepOptions.get(PARTITIONS)
117151
.map(partitionNum => df.repartition(partitionNum.toInt)).getOrElse(df)

0 commit comments

Comments
 (0)