Skip to content

Commit d15a73c

Browse files
authored
Update version to 0.17.1, enhance performance with fast regex generat… (#111)
* Update version to 0.17.1, enhance performance with fast regex generation mode, improve UI service layer, and introduce comprehensive testing infrastructure. Add new examples and documentation for Docker multi-stage builds, along with updates to existing sample plans and tasks. * Enhance regex pattern handling in data generation by implementing automatic SQL conversion for supported patterns, with fallback to UDF for unsupported ones. Update documentation to reflect new default behavior and performance improvements. Improve integration tests with unique directories and actor names to prevent state conflicts. * Add in info for integration test logs in github actions * Enhance plan loading functionality by updating methods to accept custom plan folders and include configured paths. Add a new Kafka plan YAML file for integration tests to improve testing coverage. Update documentation with new pre-filter validation examples in Java, Scala, and YAML formats. * Remove thread sleeps and use proper pekko probes for testing
1 parent d13a919 commit d15a73c

72 files changed

Lines changed: 12237 additions & 1072 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/benchmark.yml

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,18 @@ on:
55
workflows: ["Build docker images"]
66
types:
77
- completed
8-
# Allow manual triggering for testing
8+
# Allow manual triggering with custom version
99
workflow_dispatch:
10+
inputs:
11+
version:
12+
description: 'Data Caterer version to benchmark (e.g., 0.17.0)'
13+
required: false
14+
type: string
15+
skip_existence_check:
16+
description: 'Skip check for existing benchmark results'
17+
required: false
18+
type: boolean
19+
default: false
1020

1121
jobs:
1222
build:
@@ -22,24 +32,51 @@ jobs:
2232
run:
2333
working-directory: example
2434
steps:
35+
- name: Get branch name
36+
id: branch
37+
run: |
38+
if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then
39+
echo "name=${{ github.ref_name }}" >> $GITHUB_OUTPUT
40+
else
41+
echo "name=${{ github.event.workflow_run.head_branch }}" >> $GITHUB_OUTPUT
42+
fi
2543
- name: Checkout monorepo
2644
uses: actions/checkout@v4
2745
with:
28-
# For workflow_run, we need to checkout the specific commit that triggered the build
29-
ref: ${{ github.event.workflow_run.head_sha || github.sha }}
46+
# Checkout the branch, not the commit, to avoid detached HEAD
47+
ref: ${{ steps.branch.outputs.name }}
3048
fetch-depth: 2
49+
- name: Determine version to benchmark
50+
id: benchmark_version
51+
run: |
52+
if [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ inputs.version }}" ]; then
53+
echo "value=${{ inputs.version }}" >> $GITHUB_OUTPUT
54+
echo "Using manually specified version: ${{ inputs.version }}"
55+
else
56+
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
57+
echo "value=${version}" >> $GITHUB_OUTPUT
58+
echo "Using version from gradle.properties: ${version}"
59+
fi
3160
- name: Check if benchmark has already run
3261
run: |
33-
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
62+
skip_check="${{ inputs.skip_existence_check }}"
63+
version="${{ steps.benchmark_version.outputs.value }}"
64+
65+
if [ "$skip_check" == "true" ]; then
66+
echo "Skipping existence check as requested"
67+
exit 0
68+
fi
69+
3470
if [ ! -f benchmark/results/benchmark_results_${version}.txt ]; then
3571
echo "No benchmark results for version: $version, starting to run benchmarks"
3672
else
37-
echo "Benchmarks already run!"
73+
echo "Benchmarks already run for version: $version!"
74+
echo "Set 'skip_existence_check' to true to re-run anyway"
3875
exit 1
3976
fi
4077
- name: Wait for Docker image to be available
4178
run: |
42-
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
79+
version="${{ steps.benchmark_version.outputs.value }}"
4380
echo "Waiting for Docker image datacatering/data-caterer:${version} to be available..."
4481
max_attempts=10
4582
attempt=1
@@ -53,7 +90,7 @@ jobs:
5390
((attempt++))
5491
fi
5592
done
56-
93+
5794
if [ $attempt -gt $max_attempts ]; then
5895
echo "ERROR: Docker image not available after $max_attempts attempts"
5996
exit 1
@@ -67,11 +104,21 @@ jobs:
67104
- name: Get Spark query engine jars
68105
run: bash benchmark/setup_query_engine_jars.sh
69106
- name: Run benchmark script
107+
env:
108+
BENCHMARK_VERSION: ${{ steps.benchmark_version.outputs.value }}
70109
run: |
71-
version=$(grep -E "^version=" ../gradle.properties | cut -d= -f2)
72110
bash benchmark/run_benchmark.sh
73-
bash benchmark/compare_benchmark_results.sh "$version"
111+
bash benchmark/compare_benchmark_results.sh "${{ steps.benchmark_version.outputs.value }}"
74112
- name: Create pull request
75113
uses: peter-evans/create-pull-request@v6
76114
with:
77-
title: Add benchmark results
115+
title: Add benchmark results for version ${{ steps.benchmark_version.outputs.value }}
116+
body: |
117+
Automated benchmark results for Data Caterer version ${{ steps.benchmark_version.outputs.value }}
118+
119+
This PR adds benchmark performance metrics comparing different configurations.
120+
121+
Triggered by: ${{ github.event_name }}
122+
branch: benchmark-results-${{ steps.benchmark_version.outputs.value }}
123+
base: ${{ steps.branch.outputs.name }}
124+
commit-message: Add benchmark results for version ${{ steps.benchmark_version.outputs.value }}

.github/workflows/check.yml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,10 @@ jobs:
2020
uses: burrunan/gradle-cache-action@v1
2121
with:
2222
arguments: ":app:shadowJar"
23-
- name: Run integration tests
23+
- name: Run gradle integration tests
24+
run: |
25+
./gradlew :app:integrationTest --info
26+
- name: Run insta-integration tests
2427
id: tests
2528
uses: data-catering/insta-integration@v4
2629
- name: Print results
@@ -30,3 +33,7 @@ jobs:
3033
echo "Failed validations: ${{ steps.tests.outputs.num_failed_validations }}"
3134
echo "Number of validations: ${{ steps.tests.outputs.num_validations }}"
3235
echo "Validation success rate: ${{ steps.tests.outputs.validation_success_rate }}"
36+
37+
if [ "${{ steps.tests.outputs.num_failed_validations }}" -gt 0 ]; then
38+
exit 1
39+
fi

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ site
2222
.venv
2323
.python-version
2424

25+
app/docs
2526
app/out
2627
app/src/test/resources/sample/parquet
2728
app/src/test/resources/sample/json

CLAUDE.md

Lines changed: 139 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,12 +31,24 @@ The project uses Gradle with Kotlin DSL and follows a multi-module structure:
3131
./gradlew :app:test --tests "io.github.datacatering.datacaterer.core.ui.plan.PlanRepositoryTest" --info
3232
./gradlew :api:test
3333

34+
# Run integration tests (slower, more comprehensive)
35+
./gradlew :app:integrationTest --tests "io.github.datacatering.datacaterer.core.ui.plan.YamlPlanIntegrationTest" --info
36+
37+
# Run performance tests (for benchmarking)
38+
./gradlew :app:performanceTest --tests "io.github.datacatering.datacaterer.core.util.ForeignKeyUtilPerformanceTest" --info
39+
3440
# Generate test coverage with Scoverage
3541
./gradlew reportScoverage
3642

3743
# Create fat/shadow JAR for distribution
3844
./gradlew :app:shadowJar
3945

46+
# Run UI server (standalone mode)
47+
./gradlew :app:runUI
48+
49+
# Run Spark job mode
50+
./gradlew :app:runSpark
51+
4052
# Run specific configurations from IDE
4153
./gradlew :app:run --args="DataCatererUI"
4254
```
@@ -47,6 +59,11 @@ ScalaTest with JUnit Platform has limitations with Gradle's `--tests` filtering:
4759
- ✅ Use exact class names: `--tests "io.github.datacatering.datacaterer.core.ui.plan.PlanRepositoryTest"`
4860
- ❌ Do NOT use wildcards: `--tests "*PlanRunTest*"` (runs ALL tests instead of filtering)
4961

62+
**Test Types**:
63+
- **Unit tests** (`app/src/test`): Fast, isolated tests for individual components
64+
- **Integration tests** (`app/src/integrationTest`): Slower tests that verify end-to-end workflows (e.g., YAML plan processing)
65+
- **Performance tests** (`app/src/performanceTest`): Benchmarking tests for data generation and foreign key performance
66+
5067
## Architecture Overview
5168

5269
### Core Domain Concepts
@@ -70,9 +87,20 @@ app/ # Core application
7087
│ ├── generator/ # Data generation engine
7188
│ ├── validator/ # Data validation engine
7289
│ ├── sink/ # Data output processors
73-
│ ├── metadata/ # Metadata discovery and integration
90+
│ ├── plan/ # Plan and task processing
7491
│ ├── ui/ # Web UI server components
75-
│ └── util/ # Utilities and helpers
92+
│ │ ├── cache/ # Caching layer for UI
93+
│ │ ├── http/ # HTTP endpoints and routing
94+
│ │ ├── plan/ # Plan repository and management
95+
│ │ ├── resource/ # Resource management
96+
│ │ ├── sample/ # Sample data generation
97+
│ │ └── service/ # Business logic services
98+
│ ├── util/ # Utilities and helpers
99+
│ ├── alert/ # Alert/notification system
100+
│ ├── config/ # Configuration management
101+
│ ├── listener/ # Event listeners
102+
│ ├── model/ # Core data models
103+
│ └── parser/ # Parser utilities
76104
└── main/resources/ # Configuration files and UI assets
77105
```
78106

@@ -81,8 +109,11 @@ app/ # Core application
81109
**Builder Pattern**: All configuration uses immutable builders with method chaining
82110
```scala
83111
postgres("customer_postgres", "jdbc:postgresql://localhost:5432/customer")
84-
.table("accounts")
85-
.fields(field.name("account_id").regex("ACC[0-9]{10}").unique(true))
112+
.table("accounts")
113+
.fields(
114+
field.name("account_id").regex("ACC[0-9]{8}").unique(true),
115+
field.name("status").regex("(ACTIVE|INACTIVE|PENDING)")
116+
)
86117
```
87118

88119
**Case Class Data Models**: Immutable data structures with Jackson JSON serialization
@@ -127,9 +158,16 @@ case class TaskBuilder(task: Task = Task()) {
127158
Runtime behavior is controlled via environment variables:
128159
- `ENABLE_GENERATE_DATA`: Enable/disable data generation
129160
- `ENABLE_DELETE_GENERATED_RECORDS`: Enable cleanup mode
161+
- `ENABLE_GENERATE_PLAN_AND_TASKS`: Enable metadata-driven plan/task generation
162+
- `ENABLE_RECORD_TRACKING`: Enable tracking of generated records for cleanup
130163
- `PLAN_FILE_PATH`: Path to YAML plan configuration
131164
- `TASK_FOLDER_PATH`: Directory containing task definitions
132165
- `APPLICATION_CONFIG_PATH`: Custom application configuration
166+
- `GENERATED_REPORTS_FOLDER_PATH`: Output directory for reports (default: `/tmp/data-caterer/report`)
167+
- `LOG_LEVEL`: Logging level (`debug`, `info`, `warn`, `error`)
168+
169+
Configuration flags control performance optimizations:
170+
- `enableFastGeneration`: Enable fast mode (pure SQL generation without UDFs) - default: `false`
133171

134172
### Data Source Support
135173

@@ -140,30 +178,116 @@ The system supports:
140178
- **HTTP**: REST APIs with OpenAPI/Swagger integration
141179
- **Metadata Sources**: Great Expectations, JSON Schema, Data Contract CLI, OpenMetadata, Marquez
142180

181+
## Data Generation
182+
183+
### Regex Patterns
184+
185+
Fields can use regex patterns for data generation. The system uses an intelligent SQL-based approach by default:
186+
187+
**Default Regex Generation** (always enabled):
188+
- Automatically parses regex patterns into efficient SQL expressions (no UDFs)
189+
- Supports common business patterns: `\d`, `[A-Z]`, `[0-9]`, quantifiers `{n}`, `{m,n}`, alternations `(A|B|C)`
190+
- **Automatically falls back to UDF** for unsupported patterns (backreferences, lookaheads, etc.)
191+
- No configuration needed - just use `.regex()` and the system chooses the best approach
192+
193+
```scala
194+
// These patterns use pure SQL generation (fast)
195+
field.name("account_id").regex("ACC[0-9]{8}") // → CONCAT('ACC', LPAD(...))
196+
field.name("product_code").regex("[A-Z]{3}-[0-9]{2}") // → CONCAT(letters, '-', digits)
197+
field.name("status").regex("(ACTIVE|INACTIVE|PENDING)") // → ELEMENT_AT(ARRAY(...), RAND())
198+
field.name("serial").regex("[A-Z0-9]{16}") // → Alphanumeric generation
199+
200+
// Complex patterns automatically fall back to UDF (still works correctly)
201+
field.name("complex").regex("(?=lookahead)pattern") // → Uses GENERATE_REGEX UDF
202+
```
203+
204+
**Implementation**: Regex patterns are parsed using `RegexPatternParser` (in `core.generator.provider.regex` package) which converts supported patterns to an AST and generates pure SQL. Unsupported patterns automatically fall back to DataFaker's `regexify()` UDF. Parsing happens once during generator initialization with success/failure logged at DEBUG/WARN levels.
205+
143206
## UI and API Integration
144207

145208
The application includes a web UI server that provides:
146209
- Connection management and testing
147210
- Interactive plan creation
148211
- Execution history tracking
149212
- Real-time results viewing
213+
- Sample data generation
214+
215+
The UI is implemented within the app module at `app/src/main/scala/io/github/datacatering/datacaterer/core/ui/` with:
216+
- **Frontend**: Static UI assets in `app/src/main/resources/ui/`
217+
- **Backend**: Scala-based HTTP server using Apache Pekko HTTP (a fork of Akka HTTP, with a Pekko actor system)
218+
- **API Endpoints**: RESTful endpoints for connections, plans, tasks, and execution management
219+
- **Caching**: In-memory caching layer for improved performance
150220

151-
The UI is implemented as a separate module with React frontend and Scala backend using HTTP4S.
221+
To run the UI server:
222+
```bash
223+
./gradlew :app:runUI
224+
# or
225+
DEPLOY_MODE=standalone ./gradlew :app:run --args="DataCatererUI"
226+
```
152227

153228
## Testing Strategy
154229

155-
- Use ScalaTest for unit testing
230+
- **Unit tests** (`app/src/test`): Fast, isolated tests using ScalaTest with Mockito for mocking
231+
- **Integration tests** (`app/src/integrationTest`): End-to-end tests for YAML processing, plan execution, and API workflows
232+
- **Performance tests** (`app/src/performanceTest`): Benchmarking for data generation performance and optimization validation
156233
- Test both API builders and core application logic
157-
- Mock external dependencies (databases, file systems)
158-
- Use exact class names for test filtering, not wildcards
159-
- Leverage the example module for integration testing
234+
- Mock external dependencies (databases, file systems) in unit tests
235+
- Use exact class names for test filtering, NOT wildcards
236+
- Leverage the example module for real-world integration scenarios
237+
238+
**Running Tests**:
239+
```bash
240+
# Unit tests only
241+
./gradlew :app:test
242+
243+
# Integration tests
244+
./gradlew :app:integrationTest
245+
246+
# Performance tests
247+
./gradlew :app:performanceTest
248+
249+
# Specific test class
250+
./gradlew :app:test --tests "io.github.datacatering.datacaterer.core.generator.DataGeneratorFactoryTest"
251+
```
160252

161253
## Key Dependencies
162254

163255
- **Scala**: 2.12.x
164-
- **Apache Spark**: 3.5.x
165-
- **Jackson**: JSON serialization
166-
- **Quicklens**: Immutable data updates
167-
- **ScalaTest**: Testing framework
168-
- **HTTP4S**: Web server framework
169-
- **Logback/Log4j**: Logging
256+
- **Apache Spark**: 3.5.x (core data processing engine)
257+
- **Jackson**: JSON/YAML serialization (2.15.3)
258+
- **Quicklens**: Immutable data updates in builders
259+
- **ScalaTest**: Testing framework with JUnit Platform runner
260+
- **Apache Pekko**: Web server framework (HTTP/Actor system)
261+
- **DataFaker**: Data generation library for realistic fake data
262+
- **PureConfig**: Type-safe configuration loading
263+
- **Logback**: Logging framework
264+
- **Various connectors**: Postgres, MySQL, Cassandra, Kafka, BigQuery, Delta Lake, Iceberg, etc.
265+
266+
## Performance Optimization
267+
268+
Data Caterer includes several performance optimizations:
269+
270+
**Fast Generation Mode** (`enableFastGeneration: true`):
271+
- Converts regex patterns to pure SQL expressions (avoiding UDF overhead)
272+
- Dramatically improves generation speed for large datasets
273+
- Automatically falls back to UDF for unsupported patterns
274+
- Recommended for production workloads with regex-based field generation
275+
276+
**Foreign Key Optimization**:
277+
- Efficient foreign key relationship handling for referential integrity
278+
- Optimized sampling and distribution strategies
279+
- Performance testing infrastructure in `app/src/performanceTest`
280+
281+
**Configuration**:
282+
```scala
283+
// In Scala API
284+
config
285+
.generatedReportsFolderPath("/tmp/reports")
286+
.enableFastGeneration(true) // Enable SQL-based regex generation
287+
```
288+
289+
```yaml
290+
# In YAML configuration
291+
flags:
292+
enableFastGeneration: true
293+
```

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ and deep dive into issues [from the generated report](https://data.catering/late
3838

3939
1. Docker
4040
```shell
41-
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.17.0
41+
docker run -d -i -p 9898:9898 -e DEPLOY_MODE=standalone --name datacaterer datacatering/data-caterer:0.17.1
4242
```
4343
[Open localhost:9898](http://localhost:9898).
4444
1. [Run Scala/Java examples](#run-scalajava-examples)

api/src/main/scala/io/github/datacatering/datacaterer/api/ValidationBuilder.scala

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -999,15 +999,13 @@ case class FieldValidationBuilder(validationBuilder: ValidationBuilder = Validat
999999
// If all values are numeric or numeric-looking strings, render as unquoted numbers to form ARRAY<DOUBLE/NUMERIC>
10001000
val allNumericLike = seq.forall {
10011001
case s: String => Try(s.trim.toDouble).isSuccess
1002-
case n: java.lang.Number => true
1003-
case n: Number => true
1002+
case _: java.lang.Number => true
10041003
case _ => false
10051004
}
10061005
if (allNumericLike) {
10071006
seq.map {
10081007
case s: String => BigDecimal(s.trim).toString()
10091008
case n: java.lang.Number => n.toString
1010-
case n: Number => n.toString
10111009
case other => other.toString
10121010
}.mkString(",")
10131011
} else {

api/src/main/scala/io/github/datacatering/datacaterer/api/model/ConfigModels.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ case class FlagsConfig(
1616
enableGenerateValidations: Boolean = DEFAULT_ENABLE_SUGGEST_VALIDATIONS,
1717
enableAlerts: Boolean = DEFAULT_ENABLE_ALERTS,
1818
enableUniqueCheckOnlyInBatch: Boolean = DEFAULT_ENABLE_UNIQUE_CHECK_ONLY_WITHIN_BATCH,
19-
enableFastGeneration: Boolean = DEFAULT_ENABLE_FAST_GENERATION
19+
enableFastGeneration: Boolean = DEFAULT_ENABLE_FAST_GENERATION,
20+
enableForeignKeyV2: Boolean = DEFAULT_ENABLE_FOREIGN_KEY_V2
2021
)
2122

2223
case class FoldersConfig(

0 commit comments

Comments
 (0)