
Commit a47b3a8

fix bugs for adult dataset

1 parent 36c9bdd

18 files changed: 247 additions & 64 deletions

src/pasteur/attribute.py

Lines changed: 1 addition & 1 deletion
@@ -191,7 +191,7 @@ def from_str(

 class Value:
     """ Base value class """
-    name: str | None = None
+    name: str | tuple[str] | None = None
     common: int = 0

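Note: widening `name` to `str | tuple[str] | None` lets a value carry a compound name, matching the `dict[str | tuple[str], META]` metadata keys that appear in encode.py below. A minimal sketch of the two accepted forms (column names are made up):

    flat = Value()
    flat.name = "age"                # plain single-column name
    nested = Value()
    nested.name = ("adult", "age")   # compound name, e.g. table and column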

src/pasteur/encode.py

Lines changed: 7 additions & 7 deletions
@@ -8,24 +8,24 @@
 from .module import ModuleClass, ModuleFactory
 from .utils import LazyFrame, LazyDataset

-ENC = TypeVar("ENC", bound=ModuleClass)
-META = TypeVar("META")
-

-class AttributeEncoderFactory(ModuleFactory[ENC], Generic[ENC]):
+class AttributeEncoderFactory(ModuleFactory):
     """Factory base class for encoders. Use isinstance with this class
     to filter the Pasteur module list into only containing Encoders."""

     ...


-class EncoderFactory(ModuleFactory[ENC], Generic[ENC]):
+class EncoderFactory(ModuleFactory):
     """Factory base class for encoders. Use isinstance with this class
     to filter the Pasteur module list into only containing Encoders."""

     ...


+META = TypeVar("META")
+
+
 class AttributeEncoder(ModuleClass, Generic[META]):
     """Encapsulates a special way to encode an Attribute.

@@ -50,7 +50,7 @@ class AttributeEncoder(ModuleClass, Generic[META]):
     """

     name: str = ""
-    _factory = AttributeEncoderFactory["AttributeEncoder"]
+    _factory = AttributeEncoderFactory

     def fit(self, attr: Attribute, data: pd.DataFrame | None):
         raise NotImplementedError()
@@ -70,7 +70,7 @@ def get_metadata(self) -> dict[str | tuple[str], META]:

 class Encoder(ModuleClass, Generic[META]):
     name: str = ""
-    _factory = EncoderFactory["Encoder"]
+    _factory = EncoderFactory

     def fit(
         self,
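Note: subscripting a `Generic` factory at class-definition time, as the removed `_factory = AttributeEncoderFactory["AttributeEncoder"]` did, yields a `typing._GenericAlias` rather than the class itself, which breaks the `isinstance`-based module filtering the docstrings describe. A standalone sketch of the failure mode (names are illustrative, not Pasteur code):

    from typing import Generic, TypeVar

    T = TypeVar("T")

    class Factory(Generic[T]):
        ...

    alias = Factory["SomeEncoder"]          # a typing._GenericAlias, not a class
    print(alias is Factory)                 # False
    print(isinstance(Factory(), Factory))   # True: works on the plain class
    # isinstance(x, alias) raises TypeError: subscripted generics cannot
    # be used with class and instance checks

Assigning the plain class, as the fix does, keeps `_factory` usable for both instantiation and `isinstance` checks; moving `META = TypeVar("META")` below the factories works because only `AttributeEncoder` and `Encoder` remain generic.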

src/pasteur/extras/metrics/distr.py

Lines changed: 13 additions & 9 deletions
@@ -68,13 +68,15 @@ def _visualise_cs(
     results = {}

     # Add ref split first
+    zfill = lambda x: (x + 1) / np.sum(x + 1)
     name = "ref"
     res = []
     split = next(iter(data.values()))
     for col in domain:
         wrk, syn = split.wrk, split.ref
         assert syn is not None
-        chi, p = chisquare(wrk[col], syn[col])
+
+        chi, p = chisquare(zfill(wrk[col]), zfill(syn[col]))
         res.append([col, chi, p])

     results[name] = pd.DataFrame(res, columns=["col", "X^2", "p"])
@@ -84,7 +86,7 @@ def _visualise_cs(
     for col in domain:
         wrk, syn = split.wrk, split.syn
         assert syn is not None
-        chi, p = chisquare(wrk[col], syn[col])
+        chi, p = chisquare(zfill(wrk[col]), zfill(syn[col]))
         res.append([col, chi, p])

     results[name] = pd.DataFrame(res, columns=["col", "X^2", "p"])
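Note: `scipy.stats.chisquare(f_obs, f_exp)` divides by the expected frequencies and raises when the two inputs do not sum to the same total, so a single empty histogram bin (which the adult dataset produces) broke the statistic. The `zfill` lambda applies add-one smoothing and normalizes both marginals into proper distributions. A standalone sketch with made-up counts:

    import numpy as np
    from scipy.stats import chisquare

    wrk = np.array([100, 10, 45])  # observed counts
    syn = np.array([120, 0, 30])   # synthetic counts with an empty bin

    # chisquare(wrk, syn) fails: the sums differ (155 vs 150) and the
    # zero expected frequency divides by zero.
    zfill = lambda x: (x + 1) / np.sum(x + 1)
    chi, p = chisquare(zfill(wrk), zfill(syn))  # finite statistic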
@@ -125,8 +127,10 @@ def _visualise_kl(
     res = []
     for key in syn:
         col_i, col_j = key
-        k = wrk[key]
-        j = syn[key]
+
+        zfill = lambda x: (x + KL_ZERO_FILL) / np.sum(x + KL_ZERO_FILL)
+        k = zfill(wrk[key])
+        j = zfill(syn[key])

         kl = rel_entr(k, j).sum()
         kl_norm = 1 / (1 + kl)
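Note: `rel_entr(k, j)` returns infinity wherever `k > 0` and `j == 0`, so one empty cell in a synthetic two-way marginal previously made the KL score infinite and `kl_norm` zero. Smoothing both marginals with `KL_ZERO_FILL` (defined elsewhere in this file; a tiny constant is assumed below) keeps the divergence finite:

    import numpy as np
    from scipy.special import rel_entr

    KL_ZERO_FILL = 1e-24  # assumed value for illustration

    wrk = np.array([8, 2, 0])
    syn = np.array([9, 0, 1])  # zero cell where wrk is positive

    print(rel_entr(wrk / wrk.sum(), syn / syn.sum()).sum())  # inf
    zfill = lambda x: (x + KL_ZERO_FILL) / np.sum(x + KL_ZERO_FILL)
    kl = rel_entr(zfill(wrk), zfill(syn)).sum()              # finite
    print(1 / (1 + kl))  # kl_norm, 1.0 only for identical marginals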
@@ -168,19 +172,19 @@ def _process_marginals_chunk(
 ):
     assert not expand_parents, "Expanding parents not supported yet"

-    table = tables[name]()[list(domain)].to_numpy(dtype="uint16")
+    table = tables[name]()[list(domain[name])].to_numpy(dtype="uint16")
     table_domain = domain[name]
     domain_arr = np.array(list(table_domain.values()))

     # One way for CS
     one_way: dict[str, ndarray] = {}
-    for i, name in enumerate(domain):
+    for i, name in enumerate(table_domain):
         one_way[name] = calc_marginal_1way(table, domain_arr, [i], 0)

     # Two way for KL
     two_way: dict[tuple[str, str], ndarray] = {}
-    for i, col_i in enumerate(domain):
-        for j, col_j in enumerate(domain):
+    for i, col_i in enumerate(table_domain):
+        for j, col_j in enumerate(table_domain):
             two_way[(col_i, col_j)] = calc_marginal_1way(table, domain_arr, [i, j], 0)

     return one_way, two_way
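Note: `domain` here maps table names to per-table column domains, so the removed lines indexed columns with table names: `list(domain)` yields table names, not columns. The fix consistently iterates `table_domain = domain[name]`. An illustration of the assumed shapes (values made up):

    domain = {"adult": {"age": 5, "sex": 2, "income": 2}}
    name = "adult"

    table_domain = domain[name]
    print(list(domain))        # ["adult"]                <- old, wrong
    print(list(table_domain))  # ["age", "sex", "income"] <- fixed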
@@ -308,7 +312,7 @@ def process(
     # Intertwine results
     res = defaultdict(list)
     for meta, hist in zip(per_call_meta, out):
-        res[meta["split"]][meta["table"]].append(hist)
+        res[meta["table"]].append(hist)

     ret = {}
     for table, table_hists in res.items():
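Note: with `res = defaultdict(list)`, the removed line first materialized an empty list for the split key and then indexed that list with a string, which raises `TypeError`. The fix groups histograms by table only:

    from collections import defaultdict

    res = defaultdict(list)
    meta = {"split": "syn", "table": "adult"}  # illustrative metadata

    # res[meta["split"]][meta["table"]]  # TypeError: list indices must be
    #                                    # integers or slices, not str
    res[meta["table"]].append("hist")    # one list of histograms per table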

src/pasteur/extras/synth/pgm/aim.py

Lines changed: 2 additions & 0 deletions
@@ -55,6 +55,8 @@ def __init__(
     def preprocess(self, meta: dict[str, Attributes], data: dict[str, LazyFrame]):
         self.table = next(iter(meta))
         self.attrs = meta
+        self._n = data[self.table].shape[0]
+        self._partitions = len(data[self.table])

     @make_deterministic
     def bake(self, data: dict[str, LazyFrame]):

src/pasteur/extras/synth/pgm/mst.py

Lines changed: 2 additions & 0 deletions
@@ -89,6 +89,8 @@ def __init__(
     def preprocess(self, meta: dict[str, Attributes], data: dict[str, LazyFrame]):
         self.table = next(iter(meta))
         self.attrs = meta
+        self._n = data[self.table].shape[0]
+        self._partitions = len(data[self.table])

     @make_deterministic
     def bake(self, data: dict[str, LazyFrame]):

src/pasteur/extras/synth/privbayes/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -66,6 +66,9 @@ def preprocess(self, meta: dict[str, Attributes], data: dict[str, LazyFrame]):
         table = tables[table_name]
         table_attrs = attrs[table_name]

+        self._n = table.shape[0]
+        self._partitions = len(table)
+
         if self.rebalance:
             with MarginalOracle(
                 table_attrs,
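Note: AIM, MST, and PrivBayes now cache the main table's row count and partition count during `preprocess`, while the `LazyFrame` is still in scope. The sketch below restates the shared pattern; it assumes, per its use here, that `LazyFrame.shape` reports total rows without materializing the data and that `len()` reports the number of partitions:

    def preprocess(self, meta, data):
        self.table = next(iter(meta))
        self.attrs = meta
        self._n = data[self.table].shape[0]       # total rows
        self._partitions = len(data[self.table])  # partition count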

src/pasteur/extras/transformers.py

Lines changed: 3 additions & 1 deletion
@@ -52,7 +52,9 @@ def fit(self, data: pd.Series):
         if self.max is None and self.find_edges:
             self.max = data.max()
         self.attr = NumAttribute(self.col, self.bins, self.min, self.max, self.nullable)
-        return self.attr
+
+    def get_attributes(self) -> Attributes:
+        return {self.attr.name: self.attr}

     def transform(self, data: pd.Series) -> pd.DataFrame:
         return pd.DataFrame(data.clip(self.min, self.max).astype("float32"))
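Note: `fit()` previously returned the fitted attribute directly; the new `get_attributes()` accessor returns it as an `Attributes` mapping keyed by name, which composes with the `str | tuple[str]` names introduced in attribute.py. A hypothetical caller after the change (`tf` and its constructor are elided):

    tf.fit(series)               # fit() no longer returns the attribute
    attrs = tf.get_attributes()  # {attr.name: attr}, an Attributes dict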

src/pasteur/kedro/dataset/auto.py

Lines changed: 5 additions & 5 deletions
@@ -112,7 +112,7 @@ def _save_worker(
         w.write(pa.Table.from_pandas(p0, schema=schema))
         del p0

-        for p in chunk: # type: ignore
+        for p in chunk:  # type: ignore
             try:
                 w.write(pa.Table.from_pandas(p, schema=schema))
             except Exception as e:
@@ -208,9 +208,9 @@ def _load_shape_worker(load_path: str, filesystem, *_, **__):

 class AutoDataset(AbstractVersionedDataSet[pd.DataFrame, pd.DataFrame]):
     """Modified kedro parquet dataset that acts similarly to a partitioned dataset
-    and implements lazy loading. 
-    
-    In the future, this dataset will automatically handle pickling, pyarrow 
+    and implements lazy loading.
+
+    In the future, this dataset will automatically handle pickling, pyarrow
     Tables, DataFrames, and Tensors automatically based on what is saved.

     `save()` data can be a table, a callable, or a dictionary combination of both.
@@ -403,4 +403,4 @@ def _save(self, data: pd.DataFrame) -> None:
     def reset(self):
         save_path = get_filepath_str(self._get_save_path(), self._protocol)
         if self._fs.exists(save_path):
-            self._fs.rm(save_path, recursive=True, maxdepth=1)
\ No newline at end of file
+            self._fs.rm(save_path, recursive=True, maxdepth=1)

src/pasteur/kedro/dataset/multi.py

Lines changed: 145 additions & 3 deletions
@@ -1,10 +1,152 @@
-from kedro.io.partitioned_dataset import PartitionedDataSet
+import warnings
+from copy import deepcopy
+from typing import Any, Callable

+from kedro.io.core import (
+    VERSION_KEY,
+    VERSIONED_FLAG_KEY,
+    AbstractDataSet,
+    DatasetError,
+    parse_dataset_definition,
+)
+from kedro.io.partitioned_dataset import S3_PROTOCOLS

-class Multiset(PartitionedDataSet):
-    """Modified Partitioned Dataset for pasteur."""
+
+from urllib.parse import urlparse
+class Multiset(AbstractDataSet):
+    # noqa: too-many-instance-attributes,protected-access
+    """Simplified version of the partitioned dataset. Is not lazy."""
+
+    def __init__(  # noqa: too-many-arguments
+        self,
+        path: str,
+        dataset: str | type[AbstractDataSet] | dict[str, Any],
+        filepath_arg: str = "filepath",
+        filename_suffix: str = "",
+        credentials: dict[str, Any] | None = None,
+        load_args: dict[str, Any] | None = None,
+        metadata: dict[str, Any] | None = None,
+    ):
+        # noqa: import-outside-toplevel
+        from fsspec.utils import infer_storage_options  # for performance reasons
+
+        super().__init__()
+
+        self._path = path
+        self._filename_suffix = filename_suffix
+        self._protocol = infer_storage_options(self._path)["protocol"]
+        self.metadata = metadata
+
+        dataset = dataset if isinstance(dataset, dict) else {"type": dataset}
+        self._dataset_type, self._dataset_config = parse_dataset_definition(dataset)
+        if VERSION_KEY in self._dataset_config:
+            raise DatasetError(
+                f"'{self.__class__.__name__}' does not support versioning of the "
+                f"underlying dataset. Please remove '{VERSIONED_FLAG_KEY}' flag from "
+                f"the dataset definition."
+            )
+
+        self._credentials = deepcopy(credentials) or {}
+        self._filepath_arg = filepath_arg
+        if self._filepath_arg in self._dataset_config:
+            warnings.warn(
+                f"'{self._filepath_arg}' key must not be specified in the dataset "
+                f"definition as it will be overwritten by partition path"
+            )
+
+        self._load_args = deepcopy(load_args) or {}
+        self._sep = self._filesystem.sep
+        # since some filesystem implementations may implement a global cache
+        self._invalidate_caches()
+
+    @property
+    def _filesystem(self):
+        # for performance reasons
+        import fsspec  # noqa: import-outside-toplevel
+
+        protocol = "s3" if self._protocol in S3_PROTOCOLS else self._protocol
+        return fsspec.filesystem(protocol, **self._credentials)
+
+    @property
+    def _normalized_path(self) -> str:
+        if self._protocol in S3_PROTOCOLS:
+            return urlparse(self._path)._replace(scheme="s3").geturl()
+        return self._path
+
+    def _list_partitions(self) -> list[str]:
+        return [
+            path
+            for path in self._filesystem.find(self._normalized_path, **self._load_args)
+            if path.endswith(self._filename_suffix)
+        ]
+
+    def _join_protocol(self, path: str) -> str:
+        protocol_prefix = f"{self._protocol}://"
+        if self._path.startswith(protocol_prefix) and not path.startswith(
+            protocol_prefix
+        ):
+            return f"{protocol_prefix}{path}"
+        return path
+
+    def _partition_to_path(self, path: str):
+        dir_path = self._path.rstrip(self._sep)
+        path = path.lstrip(self._sep)
+        full_path = self._sep.join([dir_path, path]) + self._filename_suffix
+        return full_path
+
+    def _path_to_partition(self, path: str) -> str:
+        dir_path = self._filesystem._strip_protocol(self._normalized_path)
+        path = path.split(dir_path, 1).pop().lstrip(self._sep)
+        if self._filename_suffix and path.endswith(self._filename_suffix):
+            path = path[: -len(self._filename_suffix)]
+        return path
+
+    def _load(self) -> dict[str, Callable[[], Any]]:
+        partitions = {}
+
+        for partition in self._list_partitions():
+            kwargs = deepcopy(self._dataset_config)
+            # join the protocol back since PySpark may rely on it
+            kwargs[self._filepath_arg] = self._join_protocol(partition)
+            dataset = self._dataset_type(**kwargs)  # type: ignore
+            partition_id = self._path_to_partition(partition)
+            partitions[partition_id] = dataset.load()
+
+        return partitions
+
+    def _save(self, data: dict[str, Any]) -> None:
+        for partition_id, partition_data in sorted(data.items()):
+            kwargs = deepcopy(self._dataset_config)
+            partition = self._partition_to_path(partition_id)
+            # join the protocol back since tools like PySpark may rely on it
+            kwargs[self._filepath_arg] = self._join_protocol(partition)
+            dataset = self._dataset_type(**kwargs)  # type: ignore
+            if callable(partition_data):
+                partition_data = partition_data()  # noqa: redefined-loop-name
+            dataset.save(partition_data)
+
+        self._invalidate_caches()
+
+    def _describe(self) -> dict[str, Any]:
+        clean_dataset_config = (
+            {k: v for k, v in self._dataset_config.items()}
+            if isinstance(self._dataset_config, dict)
+            else self._dataset_config
+        )
+        return {
+            "path": self._path,
+            "dataset_type": self._dataset_type.__name__,
+            "dataset_config": clean_dataset_config,
+        }
+
+    def _invalidate_caches(self):
+        self._filesystem.invalidate_cache(self._normalized_path)

     def reset(self):
         """Removes the dataset from disk so that there are no stray partitions in subsequent runs."""
         if self._filesystem.exists(self._normalized_path):
             self._filesystem.rm(self._normalized_path, recursive=True, maxdepth=1)
+
+    def _release(self) -> None:
+        super()._release()
+        self._invalidate_caches()
src/pasteur/kedro/pipelines/meta.py

Lines changed: 7 additions & 4 deletions
@@ -74,11 +74,13 @@ def _flatten_outputs(
     if isinstance(nested, dict):
         assert isinstance(outputs, dict)
         for idx, vals in nested.items():
-            assert idx in outputs
-            data = _flatten_outputs(vals, outputs[idx])
-            out.update(data)
+            if idx in outputs:
+                data = _flatten_outputs(vals, outputs[idx])
+                out.update(data)
     else:
-        assert isinstance(outputs, list) and isinstance(nested, list)
+        assert (isinstance(outputs, list) or isinstance(outputs, tuple)) and (
+            isinstance(nested, list) or isinstance(nested, tuple)
+        )
         assert len(outputs) == len(nested)
         for vals, outs in zip(nested, outputs):
             data = _flatten_outputs(vals, outs)
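Note: the relaxed checks let `_flatten_outputs` skip nested keys that have no catalog counterpart instead of asserting, and accept tuples wherever lists were required. Illustrative inputs (structure assumed from the recursion above):

    nested = {"tables": {"adult": "adult_out"}, "extra": {"x": "x_out"}}
    outputs = {"tables": {"adult": "catalog.adult"}}  # "extra" absent

    # before: `assert idx in outputs` fails on "extra"
    # after: "extra" is skipped; tuple pairs like ("a",) vs ("catalog.a",)
    # are also accepted in the list branch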
@@ -323,6 +325,7 @@ def node(
         namespace=namespace,
     )

+
 # Tag each node in the pipeline based on its use
 TAG_VIEW = "view"
 TAG_DATASET = "dataset"
