Skip to content

Commit 00df596

Browse files
author
Tim Band
committed
First test for create-data without intermediate file
1 parent 6cde9fd commit 00df596

14 files changed

Lines changed: 309 additions & 428 deletions

File tree

datafaker/base.py

Lines changed: 0 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -22,24 +22,6 @@
2222
)
2323

2424

25-
class TableGenerator(ABC):
26-
"""Abstract base class for table generator classes."""
27-
28-
num_rows_per_pass: int = 1
29-
30-
@abstractmethod
31-
def __call__(self, dst_db_conn: Connection, metadata: MetaData) -> dict[str, Any]:
32-
"""Return, as a dictionary, a new row for the table that we are generating.
33-
34-
The only argument, `dst_db_conn`, should be a database connection to the
35-
database to which the data is being written. Most generators won't use it, but
36-
some do, and thus it's required by the interface.
37-
38-
The return value should be a dictionary with column names as strings for keys,
39-
and the values being the values for the new row.
40-
"""
41-
42-
4325
@dataclass
4426
class FileUploader:
4527
"""For uploading data files."""

datafaker/create.py

Lines changed: 36 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
"""Functions and classes to create and populate the target database."""
2-
import pathlib
32
from collections import Counter
4-
from types import ModuleType
3+
from pathlib import Path
54
from typing import Any, Generator, Iterable, Iterator, Mapping, Sequence, Tuple
65

76
from sqlalchemy import Connection, insert, inspect
@@ -10,10 +9,19 @@
109
from sqlalchemy.orm import Session
1110
from sqlalchemy.schema import CreateColumn, CreateSchema, CreateTable, MetaData, Table
1211

13-
from datafaker.base import FileUploader, TableGenerator
12+
from datafaker.base import FileUploader
13+
from datafaker.make import get_generation_info
14+
from datafaker.populate import (
15+
TableGenerator,
16+
get_symbols,
17+
get_table_generator_dict,
18+
get_story_generator_list,
19+
get_vocab_dict,
20+
)
1421
from datafaker.settings import get_destination_dsn, get_destination_schema, get_settings
1522
from datafaker.utils import (
1623
create_db_engine_dst,
24+
get_property,
1725
get_sync_engine,
1826
get_vocabulary_table_names,
1927
logger,
@@ -92,7 +100,7 @@ def create_db_vocab(
92100
metadata: MetaData,
93101
meta_dict: dict[str, Any],
94102
config: Mapping,
95-
base_path: pathlib.Path = pathlib.Path("."),
103+
base_path: Path = Path("."),
96104
) -> list[str]:
97105
"""
98106
Load vocabulary tables from files.
@@ -140,14 +148,16 @@ def create_db_vocab(
140148

141149
def create_db_data(
142150
sorted_tables: Sequence[Table],
143-
df_module: ModuleType,
151+
config: Mapping[str, Any],
152+
src_stats_filename: Path | None,
144153
num_passes: int,
145154
metadata: MetaData,
146155
) -> RowCounts:
147156
"""Connect to a database and populate it with data."""
148157
return create_db_data_into(
149158
sorted_tables,
150-
df_module,
159+
config,
160+
src_stats_filename,
151161
num_passes,
152162
get_destination_dsn(),
153163
get_destination_schema(),
@@ -158,7 +168,8 @@ def create_db_data(
158168
# pylint: disable=too-many-arguments too-many-positional-arguments
159169
def create_db_data_into(
160170
sorted_tables: Sequence[Table],
161-
df_module: ModuleType,
171+
config: Mapping[str, Any],
172+
src_stats_filename: Path | None,
162173
num_passes: int,
163174
db_dsn: str,
164175
schema_name: str | None,
@@ -176,17 +187,31 @@ def create_db_data_into(
176187
:param num_passes: Number of passes to perform.
177188
:param db_dsn: Connection string for the destination database.
178189
:param schema_name: Destination schema name.
190+
:param metadata: Destination database metadata.
179191
"""
180192
dst_engine = get_sync_engine(create_db_engine_dst(db_dsn, schema_name=schema_name))
181-
193+
gen_info = get_generation_info(metadata, config, Path("orm.blah"), Path("config.blah"), src_stats_filename)
182194
row_counts: Counter[str] = Counter()
183195
with dst_engine.connect() as dst_conn:
196+
context = get_symbols(
197+
gen_info.row_generator_module_name,
198+
gen_info.story_generator_module_name,
199+
get_property(config, "object_instantiation", dict, {}),
200+
gen_info.src_stats_filename,
201+
dst_conn,
202+
metadata,
203+
)
184204
for _ in range(num_passes):
185205
row_counts += populate(
186206
dst_conn,
187207
sorted_tables,
188-
df_module.table_generator_dict,
189-
df_module.story_generator_list,
208+
get_table_generator_dict(
209+
dst_conn,
210+
gen_info.tables,
211+
gen_info.max_unique_constraint_tries,
212+
context,
213+
),
214+
get_story_generator_list(gen_info.story_generators, context),
190215
metadata,
191216
)
192217
dst_engine.dispose()
@@ -336,7 +361,7 @@ def populate(
336361
try:
337362
with dst_conn.begin():
338363
for _ in range(table_generator.num_rows_per_pass):
339-
stmt = insert(table).values(table_generator(dst_conn, metadata))
364+
stmt = insert(table).values(table_generator(dst_conn))
340365
dst_conn.execute(stmt)
341366
row_counts[table.name] = row_counts.get(table.name, 0) + 1
342367
dst_conn.commit()

datafaker/main.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -147,15 +147,19 @@ def create_data(
147147
help="The name of the ORM yaml file",
148148
dir_okay=False,
149149
),
150-
df_file: str = Option(
151-
DF_FILENAME,
152-
help="The name of the generators file. Must be in the current working directory.",
153-
dir_okay=False,
154-
),
155150
config_file: Optional[Path] = Option(
156151
CONFIG_FILENAME,
157152
help="The configuration file",
158153
),
154+
stats_file: Optional[Path] = Option(
155+
None,
156+
help=(
157+
"Statistics file (output of make-stats); default is src-stats.yaml if the "
158+
"config file references SRC_STATS, or None otherwise."
159+
),
160+
show_default=False,
161+
dir_okay=False,
162+
),
159163
num_passes: int = Option(1, help="Number of passes (rows or stories) to make"),
160164
) -> None:
161165
"""Populate the schema in the target directory with synthetic data.
@@ -179,11 +183,11 @@ def create_data(
179183
logger.debug("Creating data.")
180184
config = read_config_file(config_file) if config_file is not None else {}
181185
orm_metadata = load_metadata_for_output(orm_file, config)
182-
df_module = import_file(df_file)
183186
try:
184187
row_counts = create_db_data(
185188
sorted_non_vocabulary_tables(orm_metadata, config),
186-
df_module,
189+
config,
190+
stats_file,
187191
num_passes,
188192
orm_metadata,
189193
)

datafaker/make.py

Lines changed: 78 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,8 @@ class FunctionCall:
6666
"""Contains the df.py content related function calls."""
6767

6868
function_name: str
69-
argument_values: list[str]
69+
args: list[str]
70+
kwargs: dict[str, str]
7071

7172

7273
@dataclass
@@ -83,7 +84,8 @@ class ColumnChoice:
8384
"""Choose columns based on a random number in [0,1)."""
8485

8586
function_name: str
86-
argument_values: list[str]
87+
args: list[str]
88+
kwargs: dict[str, str]
8789

8890

8991
def make_column_choices(
@@ -100,7 +102,8 @@ def make_column_choices(
100102
return [
101103
ColumnChoice(
102104
function_name=mg["name"],
103-
argument_values=[f"{k}={v}" for k, v in mg.get("kwargs", {}).items()],
105+
args=mg.get("args", []),
106+
kwargs=mg.get("kwargs", {}),
104107
)
105108
for mg in table_config.get("missingness_generators", [])
106109
if "name" in mg
@@ -168,12 +171,11 @@ def _get_function_call(
168171
if keyword_arguments is None:
169172
keyword_arguments = {}
170173

171-
argument_values: list[str] = [str(value) for value in positional_arguments]
172-
argument_values += [
173-
f"{key}={_render_value(value)}" for key, value in keyword_arguments.items()
174-
]
175-
176-
return FunctionCall(function_name=function_name, argument_values=argument_values)
174+
return FunctionCall(
175+
function_name=function_name,
176+
args=positional_arguments,
177+
kwargs=keyword_arguments,
178+
)
177179

178180

179181
def _get_row_generator(
@@ -580,13 +582,29 @@ def make_vocabulary_tables(
580582
)
581583

582584

583-
def make_table_generators( # pylint: disable=too-many-locals
585+
@dataclass
586+
class GenerationInfo:
587+
"""Information for the generation of all data."""
588+
provider_imports: list[str]
589+
orm_file_name: Path
590+
config_file_name: Path
591+
row_generator_module_name: str | None
592+
story_generator_module_name: str | None
593+
object_instantiation: dict[str, dict]
594+
src_stats_filename: Path | None
595+
tables: list[TableGeneratorInfo]
596+
vocabulary_tables: list[VocabularyTableGeneratorInfo]
597+
story_generators: list[StoryGeneratorInfo]
598+
max_unique_constraint_tries: int | None
599+
600+
601+
def get_generation_info( # pylint: disable=too-many-locals
584602
metadata: MetaData,
585603
config: Mapping,
586604
orm_filename: Path,
587605
config_filename: Path,
588606
src_stats_filename: Optional[Path],
589-
) -> str:
607+
) -> GenerationInfo:
590608
"""
591609
Create datafaker generator classes.
592610
@@ -605,10 +623,16 @@ def make_table_generators( # pylint: disable=too-many-locals
605623
606624
:return: A string that is a valid Python module, once written to file.
607625
"""
608-
row_generator_module_name: str = config.get("row_generators_module", None)
609-
story_generator_module_name = config.get("story_generators_module", None)
610-
object_instantiation: dict[str, dict] = config.get("object_instantiation", {})
611-
tables_config = config.get("tables", {})
626+
row_generator_module_name = get_property(
627+
config, "row_generators_module", str | None, None
628+
)
629+
story_generator_module_name = get_property(
630+
config, "story_generators_module", str | None, None
631+
)
632+
object_instantiation = get_property(
633+
config, "object_instantiation", dict, {}
634+
)
635+
tables_config = get_property(config, "tables", dict, {})
612636

613637
tables: list[TableGeneratorInfo] = []
614638
vocabulary_tables: list[VocabularyTableGeneratorInfo] = []
@@ -637,20 +661,47 @@ def make_table_generators( # pylint: disable=too-many-locals
637661

638662
story_generators = _get_story_generators(config)
639663

640-
max_unique_constraint_tries = config.get("max-unique-constraint-tries", None)
664+
max_unique_constraint_tries = get_property(
665+
config, "max-unique-constraint-tries", str | None, None
666+
)
667+
return GenerationInfo(
668+
provider_imports=PROVIDER_IMPORTS,
669+
orm_file_name=orm_filename,
670+
config_file_name=config_filename,
671+
row_generator_module_name=row_generator_module_name,
672+
story_generator_module_name=story_generator_module_name,
673+
object_instantiation=object_instantiation,
674+
src_stats_filename=src_stats_filename,
675+
tables=tables,
676+
vocabulary_tables=vocabulary_tables,
677+
story_generators=story_generators,
678+
max_unique_constraint_tries=max_unique_constraint_tries,
679+
)
680+
681+
682+
def make_table_generators( # pylint: disable=too-many-locals
683+
metadata: MetaData,
684+
config: Mapping,
685+
orm_filename: Path,
686+
config_filename: Path,
687+
src_stats_filename: Optional[Path],
688+
) -> str:
689+
gi = get_generation_info(
690+
metadata, config, orm_filename, config_filename, src_stats_filename
691+
)
641692
return generate_df_content(
642693
{
643-
"provider_imports": PROVIDER_IMPORTS,
644-
"orm_file_name": orm_filename,
645-
"config_file_name": config_filename,
646-
"row_generator_module_name": row_generator_module_name,
647-
"story_generator_module_name": story_generator_module_name,
648-
"object_instantiation": object_instantiation,
649-
"src_stats_filename": src_stats_filename,
650-
"tables": tables,
651-
"vocabulary_tables": vocabulary_tables,
652-
"story_generators": story_generators,
653-
"max_unique_constraint_tries": max_unique_constraint_tries,
694+
"provider_imports": gi.provider_imports,
695+
"orm_file_name": gi.orm_file_name,
696+
"config_file_name": gi.config_file_name,
697+
"row_generator_module_name": gi.row_generator_module_name,
698+
"story_generator_module_name": gi.story_generator_module_name,
699+
"object_instantiation": gi.object_instantiation,
700+
"src_stats_filename": gi.src_stats_filename,
701+
"tables": gi.tables,
702+
"vocabulary_tables": gi.vocabulary_tables,
703+
"story_generators": gi.story_generators,
704+
"max_unique_constraint_tries": gi.max_unique_constraint_tries,
654705
}
655706
)
656707

0 commit comments

Comments
 (0)