configure-generators --spec now allows fallbacks (#62)

tim-band · web-flow · commit c0202fce6a33 · 2025-10-03T12:08:48.000+01:00
* configure-generators --spec now allows fallbacks and multi-column generators
* null-partitioned grouped sampled generators
* SUPPRESS_COUNT is now 7
diff --git a/datafaker/generators.py b/datafaker/generators.py
@@ -944,7 +944,7 @@ class ChoiceGeneratorFactory(GeneratorFactory):
     All generators that want an average and standard deviation.
     """
     SAMPLE_COUNT = MAXIMUM_CHOICES
-    SUPPRESS_COUNT = 5
+    SUPPRESS_COUNT = 7
     def get_generators(self, columns: list[Column], engine: Engine):
         if len(columns) != 1:
             return []
@@ -1511,7 +1511,7 @@ def __init__(
 
 class NullPartitionedNormalGeneratorFactory(MultivariateNormalGeneratorFactory):
     SAMPLE_COUNT = MAXIMUM_CHOICES
-    SUPPRESS_COUNT = 5
+    SUPPRESS_COUNT = 7
 
     def function_name(self) -> str:
         return "grouped_multivariate_normal"
@@ -1567,11 +1567,13 @@ def get_generators(self, columns: list[Column], engine: Engine):
         query_name = f"{table}__{columns[0].name}"
         # Partitions for minimal suppression and no sampling
         row_partitions_maximal: dict[int, RowPartition] = {}
+        # Partitions for minimal suppression but sampling
+        row_partitions_sampled: dict[int, RowPartition] = {}
         # Partitions for normal suppression and severe sampling
         row_partitions_ss: dict[int, RowPartition] = {}
         for partition_nonnulls in powerset(nullable_columns):
             partition_def = NullPatternPartition(columns, partition_nonnulls)
-            query = self.query(
+            query_all = self.query(
                 table=table,
                 columns=partition_def.included_numeric,
                 predicates=partition_def.predicates,
@@ -1580,14 +1582,31 @@ def get_generators(self, columns: list[Column], engine: Engine):
                 constant_clauses=partition_def.constant_clauses,
             )
             row_partitions_maximal[partition_def.index] = RowPartition(
-                query,
+                query_all,
                 partition_def.included_numeric,
                 partition_def.included_choice,
                 partition_def.excluded,
                 partition_def.nones,
                 {},
             )
-            query = self.query(
+            query_sampled = self.query(
+                table=table,
+                columns=partition_def.included_numeric,
+                predicates=partition_def.predicates,
+                group_by_clause=partition_def.group_by_clause,
+                constants = partition_def.constants,
+                constant_clauses=partition_def.constant_clauses,
+                sample_count=self.SAMPLE_COUNT,
+            )
+            row_partitions_sampled[partition_def.index] = RowPartition(
+                query_sampled,
+                partition_def.included_numeric,
+                partition_def.included_choice,
+                partition_def.excluded,
+                partition_def.nones,
+                {},
+            )
+            query_ss = self.query(
                 table=table,
                 columns=partition_def.included_numeric,
                 predicates=partition_def.predicates,
@@ -1598,7 +1617,7 @@ def get_generators(self, columns: list[Column], engine: Engine):
                 sample_count=self.SAMPLE_COUNT,
             )
             row_partitions_ss[partition_def.index] = RowPartition(
-                query,
+                query_ss,
                 partition_def.included_numeric,
                 partition_def.included_choice,
                 partition_def.excluded,
@@ -1622,6 +1641,16 @@ def get_generators(self, columns: list[Column], engine: Engine):
                         partition_counts=partition_count_max_results,
                         partition_count_comment=count_comment,
                     ))
+                if self._execute_partition_queries(connection, row_partitions_sampled):
+                    gens.append(NullPartitionedNormalGenerator(
+                        query_name,
+                        row_partitions_sampled,
+                        self.function_name(),
+                        name_suffix="sampled",
+                        partition_count_query=partition_query_max,
+                        partition_counts=partition_count_max_results,
+                        partition_count_comment=count_comment,
+                    ))
                 partition_query_ss = self.get_partition_count_query(
                     nullable_columns,
                     table,
diff --git a/datafaker/interactive.py b/datafaker/interactive.py
@@ -1,12 +1,14 @@
-import cmd
-import csv
-import functools
-import re
 from abc import ABC, abstractmethod
+import cmd
 from collections.abc import Mapping
+import csv
 from dataclasses import dataclass
 from enum import Enum
+import functools
+import itertools
 from pathlib import Path
+import re
+from typing import Iterable
 
 import sqlalchemy
 from prettytable import PrettyTable
@@ -1518,15 +1520,18 @@ def do_unset(self, _arg):
         self.set_generator(None)
         self._go_next()
 
-    def do_merge(self, arg: str):
-        """ Add this column(s) to the specified column(s), so one generator covers them all. """
+    def merge_columns(self, arg: str) -> bool:
+        """
+        Add this column(s) to the specified column(s), so one generator covers them all.
+        :return: True if everything worked, False if there is an error
+        """
         cols = arg.split()
         if not cols:
             self.print("Error: merge requires a column argument")
         table_entry: GeneratorCmdTableEntry = self.get_table()
         if table_entry is None:
             self.print(self.ERROR_NO_SUCH_TABLE)
-            return
+            return False
         cols_available = functools.reduce(lambda x, y: x | y, [
             frozenset(gen.columns)
             for gen in table_entry.new_generators
@@ -1536,14 +1541,14 @@ def do_merge(self, arg: str):
         if unknown_cols:
             for uc in unknown_cols:
                 self.print(self.ERROR_NO_SUCH_COLUMN, uc)
-            return
+            return False
         gen_info = table_entry.new_generators[self.generator_index]
         current_columns = frozenset(gen_info.columns)
         stated_current_columns = cols_to_merge & current_columns
         if stated_current_columns:
             for c in stated_current_columns:
                 self.print(self.ERROR_COLUMN_ALREADY_MERGED, c)
-            return
+            return False
         # Remove cols_to_merge from each generator
         new_new_generators: list[GeneratorInfo] = []
         for gen in table_entry.new_generators:
@@ -1570,6 +1575,11 @@ def do_merge(self, arg: str):
                     )
         table_entry.new_generators = new_new_generators
         self.set_prompt()
+        return True
+
+    def do_merge(self, arg: str):
+        """ Add this column(s) to the specified column(s), so one generator covers them all. """
+        self.merge_columns(arg)  # bool result dropped; presumably a true return from do_* would stop the cmd loop
 
     def complete_merge(self, text: str, _line: str, _begidx: int, _endidx: int):
         last_arg = text.split()[-1]
@@ -1632,6 +1642,35 @@ def complete_unmerge(self, text: str, _line: str, _begidx: int, _endidx: int):
             if column.startswith(last_arg)
         ]
 
+    def get_current_columns(self) -> set[str]:
+        table_entry: GeneratorCmdTableEntry = self.get_table()
+        gen_info = table_entry.new_generators[self.generator_index]
+        return set(gen_info.columns)
+
+    def set_merged_columns(self, first_col: str, other_cols: str) -> bool:
+        """
+        Merge columns, after unmerging everything we don't want
+        :param first_col: The first column we want in the merge, must already
+        be in this column set.
+        :param other_cols: all the columns we want merged other than
+        first_col, in order, space-separated.
+        :return: True if the merge worked, false if there was an error
+        """
+        existing = self.get_current_columns()
+        existing.discard(first_col)
+        for to_remove in existing:
+            self.do_unmerge(to_remove)
+        return self.merge_columns(other_cols)
+
+
+def try_setting_generator(gc: GeneratorCmd, gens: Iterable[str]) -> bool:
+    for gen in gens:
+        new_gen = gc.get_proposed_generator_by_name(gen)
+        if new_gen is not None:
+            gc.set_generator(new_gen)
+            return True
+    return False
+
 
 def update_config_generators(
     src_dsn: str,
@@ -1649,9 +1688,13 @@ def update_config_generators(
         for line in csv.reader(spec):
             line_no += 1
             if line:
-                if len(line) != 3:
-                    logger.error("line {0} of file {1} does not have three values", line_no, spec_path)
-                if gc.go_to(f"{line[0]}.{line[1]}"):
-                    gc.do_set(line[2])
+                if len(line) < 3:
+                    logger.error("line {0} of file {1} has fewer than three values", line_no, spec_path)
+                cols = line[1].split(maxsplit=1)
+                if gc.go_to(f"{line[0]}.{cols[0]}"):
+                    if len(cols) == 1 or gc.set_merged_columns(cols[0], cols[1]):
+                        try_setting_generator(gc, itertools.islice(line, 2, None))
+                else:
+                    logger.warning("no such column {0}[{1}]", line[0], line[1])
         gc.do_quit("yes")
         return gc.config
diff --git a/tests/examples/eav.sql b/tests/examples/eav.sql
@@ -22,8 +22,8 @@ INSERT INTO public.measurement_type VALUES (5, 'matter');
 CREATE TABLE public.measurement (
     id INTEGER NOT NULL,
     type INTEGER NOT NULL,
-    first_value INTEGER,
-    second_value INTEGER,
+    first_value FLOAT,
+    second_value FLOAT,
     third_value TEXT
 );
 
@@ -57,8 +57,8 @@ INSERT INTO public.measurement VALUES (20, 5, 12.4, NULL, 'fowl');
 CREATE TABLE public.observation (
     id INTEGER NOT NULL,
     type INTEGER NOT NULL,
-    first_value INTEGER,
-    second_value INTEGER,
+    first_value FLOAT,
+    second_value FLOAT,
     third_value TEXT
 );
 
diff --git a/tests/test_interactive.py b/tests/test_interactive.py