Skip to content

Commit d15a73c

Browse files
authored
Update version to 0.17.1, enhance performance with fast regex generat… (#111)
* Update version to 0.17.1, enhance performance with fast regex generation mode, improve UI service layer, and introduce comprehensive testing infrastructure. Add new examples and documentation for Docker multi-stage builds, along with updates to existing sample plans and tasks. * Enhance regex pattern handling in data generation by implementing automatic SQL conversion for supported patterns, with fallback to UDF for unsupported ones. Update documentation to reflect new default behavior and performance improvements. Improve integration tests with unique directories and actor names to prevent state conflicts. * Add in info for integration test logs in github actions * Enhance plan loading functionality by updating methods to accept custom plan folders and include configured paths. Add a new Kafka plan YAML file for integration tests to improve testing coverage. Update documentation with new pre-filter validation examples in Java, Scala, and YAML formats. * Remove thread sleeps and use proper pekko probes for testing
1 parent d13a919 commit d15a73c

72 files changed

Lines changed: 12237 additions & 1072 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/benchmark.yml

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,18 @@ on:
55
workflows: ["Build docker images"]
66
types:
77
- completed
8-
# Allow manual triggering for testing
8+
# Allow manual triggering with custom version
99
workflow_dispatch:
10+
inputs:
11+
version:
12+
description: 'Data Caterer version to benchmark (e.g., 0.17.0)'
13+
required: false
14+
type: string
15+
skip_existence_check:
16+
description: 'Skip check for existing benchmark results'
17+
required: false
18+
type: boolean
19+
default: false
1020

1121
jobs:
1222
build:
@@ -22,24 +32,51 @@ jobs:
2232
run:
2333
working-directory: example
2434
steps:
35+
- name: Get branch name
36+
id: branch
37+
run: |
38+
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
39+
echo "name=${{ github.ref_name }}" >> $GITHUB_OUTPUT
40+
else
41+
echo "name=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
42+
fi
2543
- name: Checkout monorepo
2644
uses: actions/checkout@v4
2745
with:
28-
# For workflow_run, we need to checkout the specific commit that triggered the build
29-
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
46+
# Checkout the branch, not the commit, to avoid detached HEAD
47+
ref: ${{ steps.branch.outputs.name }}
3048
fetch-depth: 2
49+
- name: Determine version to benchmark
50+
id: benchmark_version
51+
run: |
52+
if [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ inputs.version }}" ]; then
53+
echo "value=${{ inputs.version }}" >> $GITHUB_OUTPUT
54+
echo "Using manually specified version: ${{ inputs.version }}"
55+
else
56+
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
57+
echo "value=${version}" >> $GITHUB_OUTPUT
58+
echo "Using version from gradle.properties: ${version}"
59+
fi
3160
- name: Check if benchmark has already run
3261
run: |
33-
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
62+
skip_check="${{ inputs.skip_existence_check }}"
63+
version="${{ steps.benchmark_version.outputs.value }}"
64+
65+
if [ "$skip_check" == "true" ]; then
66+
echo "Skipping existence check as requested"
67+
exit 0
68+
fi
69+
3470
if [ ! -f benchmark/results/benchmark_results_${version}.txt ]; then
3571
echo "No benchmark results for version: $version, starting to run benchmarks"
3672
else
37-
echo "Benchmarks already run!"
73+
echo "Benchmarks already run for version: $version!"
74+
echo "Set 'skip_existence_check' to true to re-run anyway"
3875
exit 1
3976
fi
4077
- name: Wait for Docker image to be available
4178
run: |
42-
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
79+
version="${{ steps.benchmark_version.outputs.value }}"
4380
echo "Waiting for Docker image datacatering/data-caterer:${version} to be available..."
4481
max_attempts=10
4582
attempt=1
@@ -53,7 +90,7 @@ jobs:
5390
((attempt++))
5491
fi
5592
done
56-
93+
5794
if [ $attempt -gt $max_attempts ]; then
5895
echo "ERROR: Docker image not available after $max_attempts attempts"
5996
exit 1
@@ -67,11 +104,21 @@ jobs:
67104
- name: Get Spark query engine jars
68105
run: bash benchmark/setup_query_engine_jars.sh
69106
- name: Run benchmark script
107+
env:
108+
BENCHMARK_VERSION: ${{ steps.benchmark_version.outputs.value }}
70109
run: |
71-
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
72110
bash benchmark/run_benchmark.sh
73-
bash benchmark/compare_benchmark_results.sh "$version"
111+
bash benchmark/compare_benchmark_results.sh "${{ steps.benchmark_version.outputs.value }}"
74112
- name: Create pull request
75113
uses: peter-evans/create-pull-request@v6
76114
with:
77-
title: Add benchmark results
115+
title: Add benchmark results for version ${{ steps.benchmark_version.outputs.value }}
116+
body: |
117+
Automated benchmark results for Data Caterer version ${{ steps.benchmark_version.outputs.value }}
118+
119+
This PR adds benchmark performance metrics comparing different configurations.
120+
121+
Triggered by: ${{ github.event_name }}
122+
branch: benchmark-results-${{ steps.benchmark_version.outputs.value }}
123+
base: ${{ steps.branch.outputs.name }}
124+
commit-message: Add benchmark results for version ${{ steps.benchmark_version.outputs.value }}

.github/workflows/check.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@ jobs:
2020
uses: burrunan/gradle-cache-action@v1
2121
with:
2222
arguments: ":app:shadowJar"
23-
- name: Run integration tests
23+
- name: Run gradle integration tests
24+
run: |
25+
./gradlew :app:integrationTest --info
26+
- name: Run insta-integration tests
2427
id: tests
2528
uses: data-catering/insta-integration@v4
2629
- name: Print results
@@ -30,3 +33,7 @@ jobs:
3033
echo "Failed validations: ${{ steps.tests.outputs.num_failed_validations }}"
3134
echo "Number of validations: ${{ steps.tests.outputs.num_validations }}"
3235
echo "Validation success rate: ${{ steps.tests.outputs.validation_success_rate }}"
36+
37+
if [ "${{ steps.tests.outputs.num_failed_validations }}" -gt 0 ]; then
38+
exit 1
39+
fi

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ site
2222
.venv
2323
.python-version
2424

25+
app/docs
2526
app/out
2627
app/src/test/resources/sample/parquet
2728
app/src/test/resources/sample/json

CLAUDE.md

Lines changed: 139 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,24 @@ The project uses Gradle with Kotlin DSL and follows a multi-module structure:
3131
./gradlew :app:test --tests "io.github.datacatering.datacaterer.core.ui.plan.PlanRepositoryTest" --info
3232
./gradlew :api:test
3333

34+
# Run integration tests (slower, more comprehensive)
35+
./gradlew :app:integrationTest --tests "io.github.datacatering.datacaterer.core.ui.plan.YamlPlanIntegrationTest" --info
36+
37+
# Run performance tests (for benchmarking)
38+
./gradlew :app:performanceTest --tests "io.github.datacatering.datacaterer.core.util.ForeignKeyUtilPerformanceTest" --info
39+
3440
# Generate test coverage with Scoverage
3541
./gradlew reportScoverage
3642

3743
# Create fat/shadow JAR for distribution
3844
./gradlew :app:shadowJar
3945

46+
# Run UI server (standalone mode)
47+
./gradlew :app:runUI
48+
49+
# Run Spark job mode
50+
./gradlew :app:runSpark
51+
4052
# Run specific configurations from IDE
4153
./gradlew :app:run --args="DataCatererUI"
4254
```
@@ -47,6 +59,11 @@ ScalaTest with JUnit Platform has limitations with Gradle's `--tests` filtering:
4759
- ✅ Use exact class names: `--tests "io.github.datacatering.datacaterer.core.ui.plan.PlanRepositoryTest"`
4860
- ❌ Do NOT use wildcards: `--tests "*PlanRunTest*"` (runs ALL tests instead of filtering)
4961

62+
**Test Types**:
63+
- **Unit tests** (`app/src/test`): Fast, isolated tests for individual components
64+
- **Integration tests** (`app/src/integrationTest`): Slower tests that verify end-to-end workflows (e.g., YAML plan processing)
65+
- **Performance tests** (`app/src/performanceTest`): Benchmarking tests for data generation and foreign key performance
66+
5067
## Architecture Overview
5168

5269
### Core Domain Concepts
@@ -70,9 +87,20 @@ app/ # Core application
7087
│ ├── generator/ # Data generation engine
7188
│ ├── validator/ # Data validation engine
7289
│ ├── sink/ # Data output processors
73-
│ ├── metadata/ # Metadata discovery and integration
90+
│ ├── plan/ # Plan and task processing
7491
│ ├── ui/ # Web UI server components
75-
│ └── util/ # Utilities and helpers
92+
│ │ ├── cache/ # Caching layer for UI
93+
│ │ ├── http/ # HTTP endpoints and routing
94+
│ │ ├── plan/ # Plan repository and management
95+
│ │ ├── resource/ # Resource management
96+
│ │ ├── sample/ # Sample data generation
97+
│ │ └── service/ # Business logic services
98+
│ ├── util/ # Utilities and helpers
99+
│ ├── alert/ # Alert/notification system
100+
│ ├── config/ # Configuration management
101+
│ ├── listener/ # Event listeners
102+
│ ├── model/ # Core data models
103+
│ └── parser/ # Parser utilities
76104
└── main/resources/ # Configuration files and UI assets
77105
```
78106

@@ -81,8 +109,11 @@ app/ # Core application
81109
**Builder Pattern**: All configuration uses immutable builders with method chaining
82110
```scala
83111
postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer")
84-
.table("accounts")
85-
.fields(field.name("account_id").regex("ACC[0-9]{10}").unique(true))
112+
.table("accounts")
113+
.fields(
114+
field.name("account_id").regex("ACC[0-9]{8}").unique(true),
115+
field.name("status").regex("(ACTIVE|INACTIVE|PENDING)")
116+
)
86117
```
87118

88119
**Case Class Data Models**: Immutable data structures with Jackson JSON serialization
@@ -127,9 +158,16 @@ case class TaskBuilder(task: Task = Task()) {
127158
Runtime behavior is controlled via environment variables:
128159
- `ENABLE_GENERATE_DATA`: Enable/disable data generation
129160
- `ENABLE_DELETE_GENERATED_RECORDS`: Enable cleanup mode
161+
- `ENABLE_GENERATE_PLAN_AND_TASKS`: Enable metadata-driven plan/task generation
162+
- `ENABLE_RECORD_TRACKING`: Enable tracking of generated records for cleanup
130163
- `PLAN_FILE_PATH`: Path to YAML plan configuration
131164
- `TASK_FOLDER_PATH`: Directory containing task definitions
132165
- `APPLICATION_CONFIG_PATH`: Custom application configuration
166+
- `GENERATED_REPORTS_FOLDER_PATH`: Output directory for reports (default: `/tmp/data-caterer/report`)
167+
- `LOG_LEVEL`: Logging level (`debug`, `info`, `warn`, `error`)
168+
169+
Configuration flags control performance optimizations:
170+
- `enableFastGeneration`: Enable fast mode (pure SQL generation without UDFs) - default: `false`
133171

134172
### Data Source Support
135173

@@ -140,30 +178,116 @@ The system supports:
140178
- **HTTP**: REST APIs with OpenAPI/Swagger integration
141179
- **Metadata Sources**: Great Expectations, JSON Schema, Data Contract CLI, OpenMetadata, Marquez
142180

181+
## Data Generation
182+
183+
### Regex Patterns
184+
185+
Fields can use regex patterns for data generation. The system uses an intelligent SQL-based approach by default:
186+
187+
**Default Regex Generation** (always enabled):
188+
- Automatically parses regex patterns into efficient SQL expressions (no UDFs)
189+
- Supports common business patterns: `\d`, `[A-Z]`, `[0-9]`, quantifiers `{n}`, `{m,n}`, alternations `(A|B|C)`
190+
- **Automatically falls back to UDF** for unsupported patterns (backreferences, lookaheads, etc.)
191+
- No configuration needed - just use `.regex()` and the system chooses the best approach
192+
193+
```scala
194+
// These patterns use pure SQL generation (fast)
195+
field.name("account_id").regex("ACC[0-9]{8}") // → CONCAT('ACC', LPAD(...))
196+
field.name("product_code").regex("[A-Z]{3}-[0-9]{2}") // → CONCAT(letters, '-', digits)
197+
field.name("status").regex("(ACTIVE|INACTIVE|PENDING)") // → ELEMENT_AT(ARRAY(...), RAND())
198+
field.name("serial").regex("[A-Z0-9]{16}") // → Alphanumeric generation
199+
200+
// Complex patterns automatically fall back to UDF (still works correctly)
201+
field.name("complex").regex("(?=lookahead)pattern") // → Uses GENERATE_REGEX UDF
202+
```
203+
204+
**Implementation**: Regex patterns are parsed using `RegexPatternParser` (in `core.generator.provider.regex` package) which converts supported patterns to an AST and generates pure SQL. Unsupported patterns automatically fall back to DataFaker's `regexify()` UDF. Parsing happens once during generator initialization with success/failure logged at DEBUG/WARN levels.
205+
143206
## UI and API Integration
144207

145208
The application includes a web UI server that provides:
146209
- Connection management and testing
147210
- Interactive plan creation
148211
- Execution history tracking
149212
- Real-time results viewing
213+
- Sample data generation
214+
215+
The UI is implemented within the app module at `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` with:
216+
- **Frontend**: Static UI assets in `app/src/main/resources/ui/`
217+
- **Backend**: Scala-based HTTP server using Apache Pekko HTTP (a fork of Akka HTTP, with a Pekko actor system)
218+
- **API Endpoints**: RESTful endpoints for connections, plans, tasks, and execution management
219+
- **Caching**: In-memory caching layer for improved performance
150220

151-
The UI is implemented as a separate module with React frontend and Scala backend using HTTP4S.
221+
To run the UI server:
222+
```bash
223+
./gradlew :app:runUI
224+
# or
225+
DEPLOY_MODE=standalone ./gradlew :app:run --args="DataCatererUI"
226+
```
152227

153228
## Testing Strategy
154229

155-
- Use ScalaTest for unit testing
230+
- **Unit tests** (`app/src/test`): Fast, isolated tests using ScalaTest with Mockito for mocking
231+
- **Integration tests** (`app/src/integrationTest`): End-to-end tests for YAML processing, plan execution, and API workflows
232+
- **Performance tests** (`app/src/performanceTest`): Benchmarking for data generation performance and optimization validation
156233
- Test both API builders and core application logic
157-
- Mock external dependencies (databases, file systems)
158-
- Use exact class names for test filtering, not wildcards
159-
- Leverage the example module for integration testing
234+
- Mock external dependencies (databases, file systems) in unit tests
235+
- Use exact class names for test filtering, NOT wildcards
236+
- Leverage the example module for real-world integration scenarios
237+
238+
**Running Tests**:
239+
```bash
240+
# Unit tests only
241+
./gradlew :app:test
242+
243+
# Integration tests
244+
./gradlew :app:integrationTest
245+
246+
# Performance tests
247+
./gradlew :app:performanceTest
248+
249+
# Specific test class
250+
./gradlew :app:test --tests "io.github.datacatering.datacaterer.core.generator.DataGeneratorFactoryTest"
251+
```
160252

161253
## Key Dependencies
162254

163255
- **Scala**: 2.12.x
164-
- **Apache Spark**: 3.5.x
165-
- **Jackson**: JSON serialization
166-
- **Quicklens**: Immutable data updates
167-
- **ScalaTest**: Testing framework
168-
- **HTTP4S**: Web server framework
169-
- **Logback/Log4j**: Logging
256+
- **Apache Spark**: 3.5.x (core data processing engine)
257+
- **Jackson**: JSON/YAML serialization (2.15.3)
258+
- **Quicklens**: Immutable data updates in builders
259+
- **ScalaTest**: Testing framework with JUnit Platform runner
260+
- **Apache Pekko**: Web server framework (HTTP/Actor system)
261+
- **DataFaker**: Data generation library for realistic fake data
262+
- **PureConfig**: Type-safe configuration loading
263+
- **Logback**: Logging framework
264+
- **Various connectors**: Postgres, MySQL, Cassandra, Kafka, BigQuery, Delta Lake, Iceberg, etc.
265+
266+
## Performance Optimization
267+
268+
Data Caterer includes several performance optimizations:
269+
270+
**Fast Generation Mode** (`enableFastGeneration: true`):
271+
- Converts regex patterns to pure SQL expressions (avoiding UDF overhead)
272+
- Dramatically improves generation speed for large datasets
273+
- Automatically falls back to UDF for unsupported patterns
274+
- Recommended for production workloads with regex-based field generation
275+
276+
**Foreign Key Optimization**:
277+
- Efficient foreign key relationship handling for referential integrity
278+
- Optimized sampling and distribution strategies
279+
- Performance testing infrastructure in `app/src/performanceTest`
280+
281+
**Configuration**:
282+
```scala
283+
// In Scala API
284+
config
285+
.generatedReportsFolderPath("/tmp/reports")
286+
.enableFastGeneration(true) // Enable SQL-based regex generation
287+
```
288+
289+
```yaml
290+
# In YAML configuration
291+
flags:
292+
enableFastGeneration: true
293+
```

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ and deep dive into issues [from the generated report](https://data.catering/late
3838

3939
1. Docker
4040
```shell
41-
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.17.0
41+
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.17.1
4242
```
4343
[Open localhost:9898](http://localhost:9898).
4444
1. [Run Scala/Java examples](#run-scalajava-examples)

api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -999,15 +999,13 @@ case class FieldValidationBuilder(validationBuilder: ValidationBuilder = Validat
999999
// If all values are numeric or numeric-looking strings, render as unquoted numbers to form ARRAY<DOUBLE/NUMERIC>
10001000
val allNumericLike = seq.forall {
10011001
case s: String => Try(s.trim.toDouble).isSuccess
1002-
case n: java.lang.Number => true
1003-
case n: Number => true
1002+
case _: java.lang.Number => true
10041003
case _ => false
10051004
}
10061005
if (allNumericLike) {
10071006
seq.map {
10081007
case s: String => BigDecimal(s.trim).toString()
10091008
case n: java.lang.Number => n.toString
1010-
case n: Number => n.toString
10111009
case other => other.toString
10121010
}.mkString(",")
10131011
} else {

api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ case class FlagsConfig(
1616
enableGenerateValidations: Boolean = DEFAULT_ENABLE_SUGGEST_VALIDATIONS,
1717
enableAlerts: Boolean = DEFAULT_ENABLE_ALERTS,
1818
enableUniqueCheckOnlyInBatch: Boolean = DEFAULT_ENABLE_UNIQUE_CHECK_ONLY_WITHIN_BATCH,
19-
enableFastGeneration: Boolean = DEFAULT_ENABLE_FAST_GENERATION
19+
enableFastGeneration: Boolean = DEFAULT_ENABLE_FAST_GENERATION,
20+
enableForeignKeyV2: Boolean = DEFAULT_ENABLE_FOREIGN_KEY_V2
2021
)
2122

2223
case class FoldersConfig(

0 commit comments

Comments
 (0)