Skip to content

Commit c0202fc

Browse files
authored
configure-generators --spec now allows fallbacks (#62)
* configure-generators --spec now allows fallbacks and multi-column generators
* null-partitioned grouped sampled generators
* SUPPRESS_COUNT is now 7
1 parent e5769a6 commit c0202fc

4 files changed

Lines changed: 316 additions & 70 deletions

File tree

datafaker/generators.py

Lines changed: 35 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -944,7 +944,7 @@ class ChoiceGeneratorFactory(GeneratorFactory):
944944
All generators that want an average and standard deviation.
945945
"""
946946
SAMPLE_COUNT = MAXIMUM_CHOICES
947-
SUPPRESS_COUNT = 5
947+
SUPPRESS_COUNT = 7
948948
def get_generators(self, columns: list[Column], engine: Engine):
949949
if len(columns) != 1:
950950
return []
@@ -1511,7 +1511,7 @@ def __init__(
15111511

15121512
class NullPartitionedNormalGeneratorFactory(MultivariateNormalGeneratorFactory):
15131513
SAMPLE_COUNT = MAXIMUM_CHOICES
1514-
SUPPRESS_COUNT = 5
1514+
SUPPRESS_COUNT = 7
15151515

15161516
def function_name(self) -> str:
15171517
return "grouped_multivariate_normal"
@@ -1567,11 +1567,13 @@ def get_generators(self, columns: list[Column], engine: Engine):
15671567
query_name = f"{table}__{columns[0].name}"
15681568
# Partitions for minimal suppression and no sampling
15691569
row_partitions_maximal: dict[int, RowPartition] = {}
1570+
# Partitions for minimal suppression but sampling
1571+
row_partitions_sampled: dict[int, RowPartition] = {}
15701572
# Partitions for normal suppression and severe sampling
15711573
row_partitions_ss: dict[int, RowPartition] = {}
15721574
for partition_nonnulls in powerset(nullable_columns):
15731575
partition_def = NullPatternPartition(columns, partition_nonnulls)
1574-
query = self.query(
1576+
query_all = self.query(
15751577
table=table,
15761578
columns=partition_def.included_numeric,
15771579
predicates=partition_def.predicates,
@@ -1580,14 +1582,31 @@ def get_generators(self, columns: list[Column], engine: Engine):
15801582
constant_clauses=partition_def.constant_clauses,
15811583
)
15821584
row_partitions_maximal[partition_def.index] = RowPartition(
1583-
query,
1585+
query_all,
15841586
partition_def.included_numeric,
15851587
partition_def.included_choice,
15861588
partition_def.excluded,
15871589
partition_def.nones,
15881590
{},
15891591
)
1590-
query = self.query(
1592+
query_sampled = self.query(
1593+
table=table,
1594+
columns=partition_def.included_numeric,
1595+
predicates=partition_def.predicates,
1596+
group_by_clause=partition_def.group_by_clause,
1597+
constants = partition_def.constants,
1598+
constant_clauses=partition_def.constant_clauses,
1599+
sample_count=self.SAMPLE_COUNT,
1600+
)
1601+
row_partitions_sampled[partition_def.index] = RowPartition(
1602+
query_sampled,
1603+
partition_def.included_numeric,
1604+
partition_def.included_choice,
1605+
partition_def.excluded,
1606+
partition_def.nones,
1607+
{},
1608+
)
1609+
query_ss = self.query(
15911610
table=table,
15921611
columns=partition_def.included_numeric,
15931612
predicates=partition_def.predicates,
@@ -1598,7 +1617,7 @@ def get_generators(self, columns: list[Column], engine: Engine):
15981617
sample_count=self.SAMPLE_COUNT,
15991618
)
16001619
row_partitions_ss[partition_def.index] = RowPartition(
1601-
query,
1620+
query_ss,
16021621
partition_def.included_numeric,
16031622
partition_def.included_choice,
16041623
partition_def.excluded,
@@ -1622,6 +1641,16 @@ def get_generators(self, columns: list[Column], engine: Engine):
16221641
partition_counts=partition_count_max_results,
16231642
partition_count_comment=count_comment,
16241643
))
1644+
if self._execute_partition_queries(connection, row_partitions_sampled):
1645+
gens.append(NullPartitionedNormalGenerator(
1646+
query_name,
1647+
row_partitions_sampled,
1648+
self.function_name(),
1649+
name_suffix="sampled",
1650+
partition_count_query=partition_query_max,
1651+
partition_counts=partition_count_max_results,
1652+
partition_count_comment=count_comment,
1653+
))
16251654
partition_query_ss = self.get_partition_count_query(
16261655
nullable_columns,
16271656
table,

datafaker/interactive.py

Lines changed: 56 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,14 @@
1-
import cmd
2-
import csv
3-
import functools
4-
import re
51
from abc import ABC, abstractmethod
2+
import cmd
63
from collections.abc import Mapping
4+
import csv
75
from dataclasses import dataclass
86
from enum import Enum
7+
import functools
8+
import itertools
99
from pathlib import Path
10+
import re
11+
from typing import Iterable
1012

1113
import sqlalchemy
1214
from prettytable import PrettyTable
@@ -1518,15 +1520,18 @@ def do_unset(self, _arg):
15181520
self.set_generator(None)
15191521
self._go_next()
15201522

1521-
def do_merge(self, arg: str):
1522-
""" Add this column(s) to the specified column(s), so one generator covers them all. """
1523+
def merge_columns(self, arg: str) -> bool:
1524+
"""
1525+
Add this column(s) to the specified column(s), so one generator covers them all.
1526+
:return: True if everything worked, False if there is an error
1527+
"""
15231528
cols = arg.split()
15241529
if not cols:
15251530
self.print("Error: merge requires a column argument")
15261531
table_entry: GeneratorCmdTableEntry = self.get_table()
15271532
if table_entry is None:
15281533
self.print(self.ERROR_NO_SUCH_TABLE)
1529-
return
1534+
return False
15301535
cols_available = functools.reduce(lambda x, y: x | y, [
15311536
frozenset(gen.columns)
15321537
for gen in table_entry.new_generators
@@ -1536,14 +1541,14 @@ def do_merge(self, arg: str):
15361541
if unknown_cols:
15371542
for uc in unknown_cols:
15381543
self.print(self.ERROR_NO_SUCH_COLUMN, uc)
1539-
return
1544+
return False
15401545
gen_info = table_entry.new_generators[self.generator_index]
15411546
current_columns = frozenset(gen_info.columns)
15421547
stated_current_columns = cols_to_merge & current_columns
15431548
if stated_current_columns:
15441549
for c in stated_current_columns:
15451550
self.print(self.ERROR_COLUMN_ALREADY_MERGED, c)
1546-
return
1551+
return False
15471552
# Remove cols_to_merge from each generator
15481553
new_new_generators: list[GeneratorInfo] = []
15491554
for gen in table_entry.new_generators:
@@ -1570,6 +1575,11 @@ def do_merge(self, arg: str):
15701575
)
15711576
table_entry.new_generators = new_new_generators
15721577
self.set_prompt()
1578+
return True
1579+
1580+
def do_merge(self, arg: str):
1581+
""" Add this column(s) to the specified column(s), so one generator covers them all. """
1582+
self.merge_columns(arg)
15731583

15741584
def complete_merge(self, text: str, _line: str, _begidx: int, _endidx: int):
15751585
last_arg = text.split()[-1]
@@ -1632,6 +1642,35 @@ def complete_unmerge(self, text: str, _line: str, _begidx: int, _endidx: int):
16321642
if column.startswith(last_arg)
16331643
]
16341644

1645+
def get_current_columns(self) -> set[str]:
1646+
table_entry: GeneratorCmdTableEntry = self.get_table()
1647+
gen_info = table_entry.new_generators[self.generator_index]
1648+
return set(gen_info.columns)
1649+
1650+
def set_merged_columns(self, first_col: str, other_cols: str) -> bool:
1651+
"""
1652+
Merge columns, after unmerging everything we don't want
1653+
:param first_col: The first column we want in the merge, must already
1654+
be in this column set.
1655+
:param other_cols: all the columns we want merged other than
1656+
first_col, in order, space-separated.
1657+
:return: True if the merge worked, false if there was an error
1658+
"""
1659+
existing = self.get_current_columns()
1660+
existing.discard(first_col)
1661+
for to_remove in existing:
1662+
self.do_unmerge(to_remove)
1663+
return self.merge_columns(other_cols)
1664+
1665+
1666+
def try_setting_generator(gc: GeneratorCmd, gens: Iterable[str]) -> bool:
1667+
for gen in gens:
1668+
new_gen = gc.get_proposed_generator_by_name(gen)
1669+
if new_gen is not None:
1670+
gc.set_generator(new_gen)
1671+
return True
1672+
return False
1673+
16351674

16361675
def update_config_generators(
16371676
src_dsn: str,
@@ -1649,9 +1688,13 @@ def update_config_generators(
16491688
for line in csv.reader(spec):
16501689
line_no += 1
16511690
if line:
1652-
if len(line) != 3:
1653-
logger.error("line {0} of file {1} does not have three values", line_no, spec_path)
1654-
if gc.go_to(f"{line[0]}.{line[1]}"):
1655-
gc.do_set(line[2])
1691+
if len(line) < 3:
1692+
logger.error("line {0} of file {1} has fewer than three values", line_no, spec_path)
1693+
cols = line[1].split(maxsplit=1)
1694+
if gc.go_to(f"{line[0]}.{cols[0]}"):
1695+
if len(cols) == 1 or gc.set_merged_columns(cols[0], cols[1]):
1696+
try_setting_generator(gc, itertools.islice(line, 2, None))
1697+
else:
1698+
logger.warning("no such column {0}[{1}]", line[0], line[1])
16561699
gc.do_quit("yes")
16571700
return gc.config

tests/examples/eav.sql

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@ INSERT INTO public.measurement_type VALUES (5, 'matter');
2222
CREATE TABLE public.measurement (
2323
id INTEGER NOT NULL,
2424
type INTEGER NOT NULL,
25-
first_value INTEGER,
26-
second_value INTEGER,
25+
first_value FLOAT,
26+
second_value FLOAT,
2727
third_value TEXT
2828
);
2929

@@ -57,8 +57,8 @@ INSERT INTO public.measurement VALUES (20, 5, 12.4, NULL, 'fowl');
5757
CREATE TABLE public.observation (
5858
id INTEGER NOT NULL,
5959
type INTEGER NOT NULL,
60-
first_value INTEGER,
61-
second_value INTEGER,
60+
first_value FLOAT,
61+
second_value FLOAT,
6262
third_value TEXT
6363
);
6464

0 commit comments

Comments
 (0)