Skip to content

Commit 435fd6f

Browse files
committed
basic docs skeleton
1 parent 4543fe2 commit 435fd6f

8 files changed

Lines changed: 116 additions & 93 deletions

File tree

LICENSE

Whitespace-only changes.

checks.yaml

Lines changed: 9 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -5,54 +5,34 @@ validations:
55
checks:
66
- id: row_count > 0
77
description: "data is present" # optional
8-
severity: error # optional (error, warn, info), default "error"
8+
on_fail: error # optional (error, warn), default "error"
99

1010
- id: row_count between 100 and 30000
1111
description: "data is not too big"
12-
severity: error
12+
on_fail: error
1313

1414
- id: null_count(pickup_ntaname) == 0
1515
description: "no nulls in column" # optional
16-
severity: error
16+
on_fail: error
1717

1818
- id: min(pickup_datetime) < now() - interval 3 day
1919
description: "min check"
20-
severity: error
20+
on_fail: error
2121

2222
- id: stddevPop(trip_distance) < 100_000
2323
description: "check stddev value"
24-
severity: error
24+
on_fail: error
2525

2626
- id: sum(fare_amount) <= 10_000_000
2727
description: "sum of value"
28-
severity: error
28+
on_fail: error
2929

3030
- id: countIf(trip_id == 1) == 1
3131
description: "check trip id"
32-
severity: warn
32+
on_fail: warn
3333

3434
- id: raw_query
3535
description: "some raw query description here"
36-
severity: error
36+
on_fail: error
3737
query: |
38-
select countIf(trip_distance == 0) > 0 from {{table}} where 1=1
39-
40-
# - dataset: pgsql-staging@[public.table_1, public.table_2]
41-
# checks:
42-
# - id: row_count > 0
43-
# severity: warn
44-
# type:
45-
46-
47-
# v1 supported functions:
48-
# ---
49-
# row_count > 10
50-
# null_count(col) == 0
51-
# avg(col) <= 24.2
52-
# max(col) < 1000
53-
# min(col) == 0
54-
# sum(col) > 0
55-
# stddev(col) between 1 and 100_000_000
56-
# custom
57-
58-
# AI anomaly detection
38+
select countIf(trip_distance == 0) > 0 from {{table}} where 1=1

cmd/check.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ By automating these checks, you can proactively identify and address data qualit
2929
return fmt.Errorf("error while loading checks configuration file: %w", err)
3030
}
3131

32-
shouldFail := false
32+
exitCode := 0
3333
for _, rule := range checksCfg.Validations {
3434
dataSourceId, datasets, err := parseDatasetString(rule.Dataset)
3535
if err != nil {
@@ -50,16 +50,16 @@ By automating these checks, you can proactively identify and address data qualit
5050
}
5151

5252
log.Printf(" [%d/%d] '%s': %s", cIdx+1, len(rule.Checks), check.ID, getCheckResultLabel(pass))
53-
if !pass && check.Severity == "error" {
54-
shouldFail = true
53+
if !pass && internal.IdOrDefault(string(check.OnFail), internal.OnFailActionError) == "error" {
54+
exitCode = 1
5555
}
5656
}
5757
}
5858
}
5959

60-
if shouldFail {
61-
log.Printf("One or more checks with 'error' severity have failed, exiting...")
62-
os.Exit(1)
60+
if exitCode != 0 {
61+
log.Printf("One or more checks with on_fail = 'error' action have failed, exiting.")
62+
os.Exit(exitCode)
6363
}
6464

6565
return nil

cmd/profile.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ func NewProfileCommand(app internal.DbqApp) *cobra.Command {
1313

1414
cmd := &cobra.Command{
1515
Use: "profile",
16-
Short: "Collects dataset's information and generates column statistics",
16+
Short: "Collects dataset's information and generates column statistics",
1717
Long: `The 'profile' command connects to the specified data source and analyzes a given dataset. It gathers essential information about the table, such as the total number of rows.
1818
Additionally, for each column within the table, it calculates and reports various statistical metrics. These metrics may include the minimum value, maximum value, the count of null or missing values, the data type,
1919
and other relevant statistics depending on the data type and the capabilities of the underlying data source.

cmd/version.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@ import (
55
"github.com/spf13/cobra"
66
)
77

8+
const (
9+
DbqVersion = "v0.0.3"
10+
)
11+
812
func NewVersionCommand() *cobra.Command {
913
cmd := &cobra.Command{
1014
Use: "version",
1115
Short: "Prints dbq version",
1216
Run: func(cmd *cobra.Command, args []string) {
13-
fmt.Println("DataBridge Quality Core: 0.0.1")
17+
fmt.Printf("DataBridge Quality Core: %s\n", DbqVersion)
1418
},
1519
}
1620

internal/checks_config.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ import (
55
"os"
66
)
77

8+
type OnFailAction string
9+
10+
const (
11+
OnFailActionError = "error"
12+
OnFailActionWarning = "warn"
13+
)
14+
815
type ChecksConfig struct {
916
Version string `yaml:"version"`
1017
Validations []Validation `yaml:"validations"`
@@ -17,10 +24,10 @@ type Validation struct {
1724
}
1825

1926
type Check struct {
20-
ID string `yaml:"id"`
21-
Description string `yaml:"description,omitempty"` // Optional
22-
Severity string `yaml:"severity,omitempty"` // Optional (error, warn, info)
23-
Query string `yaml:"query,omitempty"` // Optional raw query
27+
ID string `yaml:"id"`
28+
Description string `yaml:"description,omitempty"` // optional
29+
OnFail OnFailAction `yaml:"on_fail,omitempty"` // optional (error, warn)
30+
Query string `yaml:"query,omitempty"` // optional raw query
2431
}
2532

2633
func LoadChecksConfig(fileName string) (*ChecksConfig, error) {

internal/utils.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
package internal
2+
3+
func IdOrDefault(original string, defaultVal string) string {
4+
if original == "" {
5+
return defaultVal
6+
}
7+
return original
8+
}

readme.md

Lines changed: 76 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,65 +1,89 @@
11
# DataBridge Quality Core
22

3-
dbq is a data quality tool that provides a set of tools to validate and test data in your data pipeline.
4-
It is designed to be easy to use and integrate into your existing workflow.
3+
`dbq` is a free, open-source data quality CLI checker that provides a set of tools to profile, validate and test data in your data warehouse or databases.
4+
It is designed to be flexible, fast, easy to use and integrate seamlessly into your existing workflow.
55

6-
## Help
7-
- dqb ping cnn-id
8-
- dbq import cnn-id --filter "reporting.*" --cfg checks.yaml --update-cfg
9-
- dbq check --cfg checks.yaml
10-
- dbq --config /Users/artem/code/dbq/dbq.yaml import
11-
- dbq profile --datasource cnn-id --dataset table_name
12-
13-
## 0.1
14-
- [x] basic structure
15-
- [x] define checks cfg v1
16-
- [x] checks cfg parser v1
17-
- [x] complete clickhouse support
18-
- [x] ping
19-
- [x] import datasets
20-
- [x] profile dataset
21-
- [x] rows in table
22-
- [x] min, max, avg, stddev for numeric columns
23-
- [x] count of nulls and blanks
24-
- [x] most frequent value in column
25-
- [x] JSON export
26-
- [x] run checks
27-
- [x] implement support for custom sql check
28-
- [x] implement aliases for common checks based on raw sql check
29-
- [x] fix cmd descriptions
30-
- [x] review todos
31-
- [x] improve output
32-
- [ ] basic cross validation (dataset is defined)
33-
- [ ] review logs
34-
- [ ] review crashes (wrong arguments)
35-
- [ ] default values (e.g. severity)
36-
- [ ] quiet/verbose mode for logs
37-
- [ ] docs
38-
39-
## 0.x
40-
- config validation
41-
- add postgres support
42-
- CLI for adding more checks
43-
- AirFlow integration (operator)
44-
- output format flag
6+
## Features
7+
8+
data profiling
459

10+
v1 supported checks
4611
---
12+
row_count > 10
13+
null_count(col) == 0
14+
avg(col) <= 24.2
15+
max(col) < 1000
16+
min(col) == 0
17+
sum(col) > 0
18+
stddevPop(col) between 1 and 100_000_000
19+
custom
20+
21+
## Supported databases
22+
- [ClickHouse](https://clickhouse.com/)
4723

48-
## Checks config specification
49-
- raw_query(query = "...")
50-
- row_count
51-
- null_count(col)
52-
- <aggr_function> <op> <rest>
24+
## Usage
5325

54-
### clickhouse
26+
### Installation
27+
28+
Download the latest binaries from [GitHub Releases](https://github.com/DataBridge-Tech/dbq/releases).
29+
30+
### Configuration
31+
32+
Create a dbq configuration file (default lookup paths are $HOME/.dbq.yaml and ./dbq.yaml). Alternatively,
33+
you can specify the configuration file at launch via the `--config` parameter:
5534

5635
```bash
57-
docker run -d -p 18123:8123 -p19000:9000 -e CLICKHOUSE_PASSWORD=changeme --name some-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server
36+
dbq --config /path/to/dbq.yaml import
5837
```
5938

60-
# Supported Datasources
61-
- Clickhouse
39+
```yaml
40+
# dbq.yaml
41+
version: "1"
42+
datasources:
43+
- id: clickhouse
44+
type: clickhouse
45+
configuration:
46+
host: 0.0.0.0:19000
47+
port: 19000
48+
username: default
49+
password: changeme
50+
database: default
51+
datasets:
52+
- nyc_taxi.trips_small
53+
```
54+
55+
### Checks example
56+
57+
### Commands
6258
63-
# dbq configuration
59+
```bash
60+
$ dbq help
61+
62+
dbq is a CLI tool for profiling data and running quality checks across various data sources
63+
64+
Usage:
65+
dbq [command]
66+
67+
Available Commands:
68+
check Runs data quality checks defined in a configuration file against a datasource
69+
completion Generate the autocompletion script for the specified shell
70+
help Help about any command
71+
import Connects to a data source and imports all available tables as datasets
72+
ping Checks if the data source is reachable
73+
profile Collects dataset's information and generates column statistics
74+
version Prints dbq version
75+
76+
Flags:
77+
--config string config file (default is $HOME/.dbq.yaml or ./dbq.yaml)
78+
-h, --help help for dbq
79+
80+
Use "dbq [command] --help" for more information about a command.
6481

65-
# checks configuration
82+
```
83+
84+
### Quick start
85+
- dbq ping cnn-id
86+
- dbq import cnn-id --filter "reporting.*" --cfg checks.yaml --update-cfg
87+
- dbq check --cfg checks.yaml
88+
- dbq --config /Users/artem/code/dbq/dbq.yaml import
89+
- dbq profile --datasource cnn-id --dataset table_name

0 commit comments

Comments
 (0)