Fix errors introduced by the refactor

antheas · antheas · commit 9144c5e3510b · 2023-03-18T11:50:21.000Z
diff --git a/src/pasteur/attribute.py b/src/pasteur/attribute.py
@@ -262,6 +262,7 @@ def upsample(self, value: np.ndarray, height: int, deterministic: bool = True):
     def select_height(self) -> int:
         return 0
 
+IdxValue = CatValue
 
 class StratifiedValue(CatValue):
     """A version of CategoricalValue which uses a Stratification to represent 
diff --git a/src/pasteur/dataset.py b/src/pasteur/dataset.py
@@ -102,7 +102,7 @@ def ingest(self, name, **tables: Any) -> LazyFrame:
 
         @warning: all partitioned tables should have the same partitions.
         Some tables may not be partitioned.
-
+        
         Tip: use a `match` statement to fork based on table name to per-table functions."""
         raise NotImplemented()
 
@@ -149,5 +149,18 @@ def keys(self, **tables: LazyChunk) -> pd.DataFrame:
 
         return tables["table"]()
 
+class TypedDataset(Dataset):
+    """Extend from this class to create an intermediary step in ingestion, where the table
+    is loaded from `<dataset>.raw@<table>` to a parquet one, `<dataset>.typed.<table>`.
+
+    Useful for multiple reads to raw tables. You can also override the `type()` function to make
+    minor changes to the dataset. By default it's the identity.
+
+    Since parquet files don't support chunked loading, it's unused."""
+
+    def type(self, table: Any):
+        return table
+
+
 
 __all__ = ["Dataset", "TabularDataset"]
diff --git a/src/pasteur/extras/__init__.py b/src/pasteur/extras/__init__.py
@@ -1,3 +1,6 @@
+""" This package contains reference implementations for Pasteur modules, which
+may be extracted to a separate package in the future."""
+
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
diff --git a/src/pasteur/extras/encoders.py b/src/pasteur/extras/encoders.py
@@ -1,19 +1,26 @@
 from copy import copy
+from typing import cast
 
 import numpy as np
 import pandas as pd
 
-from ..attribute import Attribute, IdxValue, NumValue, OrdValue, get_dtype
+from ..attribute import (
+    Attribute,
+    CatValue,
+    NumValue,
+    _create_strat_value_ord,
+    get_dtype,
+)
 from ..encode import Encoder
 
 
 class DiscretizationColumnTransformer:
     """Converts a numerical column into an ordinal one using histograms."""
 
-    def fit(self, attr: NumValue, data: pd.Series) -> IdxValue:
+    def fit(self, attr: NumValue, data: pd.Series) -> CatValue:
         self.in_attr = attr
         assert data.name
-        self.col = data.name
+        self.col = cast(str, data.name)
 
         rng = (
             (attr.min, attr.max)
@@ -26,7 +33,7 @@ def fit(self, attr: NumValue, data: pd.Series) -> IdxValue:
         self.vals = ((self.edges[:-1] + self.edges[1:]) / 2).astype(np.float32)
 
         if attr.common <= 1:
-            self.attr = OrdValue(self.vals, na=attr.common == 1)
+            self.attr = _create_strat_value_ord(self.vals, na=attr.common == 1)
         else:
             assert (
                 False
@@ -117,7 +124,7 @@ def fit(self, attr: Attribute, data: pd.DataFrame) -> Attribute:
         skip_common = False
         if len(attr.vals) == 1:
             v = next(iter(attr.vals.values()))
-            if isinstance(v, IdxValue) and v.is_ordinal:
+            if isinstance(v, CatValue) and v.is_ordinal:
                 skip_common = True
 
         if not skip_common:
@@ -127,7 +134,7 @@ def fit(self, attr: Attribute, data: pd.DataFrame) -> Attribute:
         for name, col in attr.vals.items():
             if isinstance(col, NumValue):
                 cols[name] = col
-            elif isinstance(col, IdxValue):
+            elif isinstance(col, CatValue):
                 if col.is_ordinal():
                     cols[name] = NumValue()
                 else:
@@ -150,14 +157,14 @@ def encode(self, data: pd.DataFrame) -> pd.DataFrame:
         skip_common = False
         if len(a.vals) == 1:
             v = next(iter(a.vals.values()))
-            if isinstance(v, IdxValue) and v.is_ordinal:
+            if isinstance(v, CatValue) and v.is_ordinal:
                 skip_common = True
 
         for i in range(a.common) if not skip_common else []:
             cmn_col = pd.Series(False, index=data.index, name=f"{a.name}_cmn_{i}", dtype=np.float32)
 
             for name, col in a.vals.items():
-                if isinstance(col, IdxValue):
+                if isinstance(col, CatValue):
                     cmn_col += data[name] == i
                 elif isinstance(col, NumValue) and only_has_na:
                     # Numerical values are expected to be NA for all common values
@@ -170,7 +177,7 @@ def encode(self, data: pd.DataFrame) -> pd.DataFrame:
         for name, col in a.vals.items():
             if isinstance(col, NumValue):
                 cols.append(data[name])
-            elif isinstance(col, IdxValue):
+            elif isinstance(col, CatValue):
                 # TODO add proper encodings other than one hot
 
                 # Handle ordinal values
diff --git a/src/pasteur/extras/metrics/distr.py b/src/pasteur/extras/metrics/distr.py
@@ -9,7 +9,7 @@
 from scipy.special import rel_entr
 from scipy.stats import chisquare
 
-from ...attribute import Attributes, IdxValue, get_dtype
+from ...attribute import Attributes, CatValue, get_dtype
 from ...metric import Summaries, TableData, TableMetric
 from ...utils.progress import process_in_parallel
 
@@ -70,7 +70,7 @@ def fit(
         self.domain = {}
         for attr in table_attrs.values():
             for name, val in attr.vals.items():
-                assert isinstance(val, IdxValue)
+                assert isinstance(val, CatValue)
                 self.domain[name] = val.domain
 
     def process_chunk(
@@ -187,7 +187,7 @@ def fit(
         self.domain = {}
         for attr in table_attrs.values():
             for name, val in attr.vals.items():
-                assert isinstance(val, IdxValue)
+                assert isinstance(val, CatValue)
                 self.domain[name] = val.domain
 
     def process_chunk(
diff --git a/src/pasteur/extras/synth/privbayes/implementation.py b/src/pasteur/extras/synth/privbayes/implementation.py
@@ -5,7 +5,7 @@
 import numpy as np
 import pandas as pd
 
-from ....attribute import Attributes, IdxValue, get_dtype
+from ....attribute import Attributes, CatValue, get_dtype
 from ....marginal import (
     ZERO_FILL,
     AttrSelector,
@@ -230,7 +230,7 @@ def greedy_bayes(
     for i, (an, a) in enumerate(attrs.items()):
         group_names.append(an)
         for c_n, c in a.vals.items():
-            c = cast(IdxValue, c)
+            c = cast(CatValue, c)
             col_names.append(c_n)
             groups.append(i)
             heights.append(c.height)
@@ -245,7 +245,7 @@ def greedy_bayes(
 
         for i, (an, a) in enumerate(attrs.items()):
             for c_n, c in a.vals.items():
-                c = cast(IdxValue, c)
+                c = cast(CatValue, c)
 
                 doms = []
                 for i in range(c.height):
@@ -660,7 +660,7 @@ def sample_rows(
                 p_partial = partial and attr_name == x_attr
                 for i, (col_name, h) in enumerate(attr.cols.items()):
                     col = attrs[attr_name].vals[col_name]
-                    col = cast(IdxValue, col)
+                    col = cast(CatValue, col)
                     mapping = np.array(col.get_mapping(h), dtype=dtype)
                     domain = col.get_domain(h)
 
diff --git a/src/pasteur/extras/transformers.py b/src/pasteur/extras/transformers.py
@@ -7,12 +7,12 @@
 from ..attribute import (
     Attribute,
     CatAttribute,
-    Level,
-    LevelValue,
+    Grouping,
+    CatValue,
     NumAttribute,
     NumValue,
     OrdAttribute,
-    OrdValue,
+    _create_strat_value_ord as OrdValue,
     get_dtype,
 )
 from ..transform import RefTransformer, Transformer
@@ -443,7 +443,7 @@ def fit(
                 hours.append(f"{hour:02d}:00")
             elif span == "halfhour":
                 hours.append(
-                    Level(
+                    Grouping(
                         "ord",
                         [f"{hour:02d}:00", f"{hour:02d}:30"],
                     )
@@ -455,7 +455,7 @@ def fit(
                         mins.append(f"{hour:02d}:{min:02d}")
                     if span == "halfminute":
                         mins.append(
-                            Level(
+                            Grouping(
                                 "ord",
                                 [
                                     f"{hour:02d}:{min:02d}:00",
@@ -467,17 +467,17 @@ def fit(
                         secs = []
                         for sec in range(60):
                             secs.append(f"{hour:02d}:{min:02d}:{sec:02d}")
-                        mins.append(Level("ord", secs))
+                        mins.append(Grouping("ord", secs))
 
-                hours.append(Level("ord", mins))
-        lvl = Level("ord", hours)
+                hours.append(Grouping("ord", mins))
+        lvl = Grouping("ord", hours)
         if self.nullable:
-            lvl = Level("cat", [None, lvl])
+            lvl = Grouping("cat", [None, lvl])
 
         self.domain = lvl.size
 
         self.attr = Attribute(
-            cast(str, data.name), {f"{data.name}_time": LevelValue(lvl)}, self.nullable
+            cast(str, data.name), {f"{data.name}_time": CatValue(lvl)}, self.nullable
         )
         return self.attr
 
diff --git a/src/pasteur/kedro/pipelines/synth.py b/src/pasteur/kedro/pipelines/synth.py
@@ -25,8 +25,8 @@ def create_synth_pipeline(
     tables = view.tables
 
     tags: list[str] = list(TAGS_SYNTH)
-    if fr.gpu:
-        tags.append(TAG_GPU)
+    # if fr.gpu:
+    #     tags.append(TAG_GPU)
 
     pipe = pipeline(
         [
diff --git a/src/pasteur/marginal/memory.py b/src/pasteur/marginal/memory.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from ..attribute import Attributes, get_dtype, IdxValue
+from ..attribute import Attributes, get_dtype, CatValue
 from ..utils import LazyFrame
 
 class ArrayInfo(NamedTuple):
@@ -36,7 +36,7 @@ def allocate_memory(data: LazyFrame, attrs: Attributes, *, common: bool = False)
             continue
 
         for name, col in attr.vals.items():
-            col = cast(IdxValue, col)
+            col = cast(CatValue, col)
             info[name] = []
             for height in range(col.height):
                 shape = (n, )
diff --git a/src/pasteur/marginal/numpy.py b/src/pasteur/marginal/numpy.py
@@ -3,7 +3,7 @@
 import numpy as np
 import pandas as pd
 
-from ..attribute import Attributes, get_dtype, IdxValue
+from ..attribute import Attributes, get_dtype, CatValue
 
 ZERO_FILL = 1e-24
 
@@ -63,7 +63,7 @@ def expand_table(
             if name not in table:
                 continue
 
-            col = cast(IdxValue, col)
+            col = cast(CatValue, col)
             col_hier = []
             col_noncommon = []
             col_dom = []
@@ -98,7 +98,7 @@ def get_domains(attrs: Attributes) -> dict[str, list[int]]:
     domains = {}
     for attr in attrs.values():
         for name, col in attr.vals.items():
-            col = cast(IdxValue, col)
+            col = cast(CatValue, col)
             col_dom = []
 
             for height in range(col.height):
diff --git a/src/pasteur/utils/__init__.py b/src/pasteur/utils/__init__.py
@@ -6,4 +6,6 @@
     gen_closure,
     to_chunked,
     apply_fun,
+    list_unique,
+    get_relative_fn
 )

Original file line number	Diff line number	Diff line change
`@@ -25,8 +25,8 @@ def create_synth_pipeline(`
`25`	`25`	`tables = view.tables`
`26`	`26`
`27`	`27`	`tags: list[str] = list(TAGS_SYNTH)`
`28`		`- if fr.gpu:`
`29`		`- tags.append(TAG_GPU)`
	`28`	`+ # if fr.gpu:`
	`29`	`+ # tags.append(TAG_GPU)`
`30`	`30`
`31`	`31`	`pipe = pipeline(`
`32`	`32`	`[`