fix out-of-core and references for mimic

antheas · antheas · commit e4f1f33fa922 · 2023-08-13T22:59:23.000Z
diff --git a/src/pasteur/extras/datasets/mimic/__init__.py b/src/pasteur/extras/datasets/mimic/__init__.py
@@ -8,7 +8,6 @@
 from ....dataset import Dataset
 from ....utils import (
     LazyChunk,
-    LazyDataset,
     LazyFrame,
     gen_closure,
     get_relative_fn,
@@ -20,24 +19,31 @@
 
 
 def _split_table(
-    chunksize: int, keys: np.ndarray, table: "Callable[..., TextFileReader]"
+    name: str, chunksize: int, keys: np.ndarray, table: "Callable[..., TextFileReader]"
 ):
     pd_keys = pd.DataFrame(index=keys)
     del keys
 
     for chunk in table(chunksize=chunksize):
-        yield chunk.join(pd_keys, on="subject_id", how="inner")
+        c = chunk.join(pd_keys, on="subject_id", how="inner")
+
+        # MIMIC-IV poe_id is "<subject_id>-<poe_seq>": keep only the sequence number
+        if name == 'hosp_pharmacy':
+            c['poe_seq'] = c['poe_id'].str.split('-', n=1).str[1].astype('Int16')
+            c = c.drop(columns=['poe_id'])
+
+        yield c
 
 
 def _partition_table(
-    table: Callable, patients: LazyFrame, n_partition: int, chunksize: int
+    name: str, table: Callable, patients: LazyFrame, n_partition: int, chunksize: int
 ):
     # Deterministic loading = all tables have the same split
     keys = patients(["subject_id"]).index.to_numpy()
     partitions = np.array_split(keys, n_partition)
 
     return {
-        str(i): gen_closure(_split_table, chunksize, part, table)
+        str(i): gen_closure(_split_table, name, chunksize, part, table)
         for i, part in enumerate(partitions)
     }
 
@@ -97,6 +103,7 @@ def ingest(self, name, **tables: LazyFrame | Callable[[], TextFileReader]):
         if name in self._mimic_tables_partitioned:
             chunksize = self._mimic_tables_partitioned[name]
             return _partition_table(
+                name,
                 cast("Callable[[], TextFileReader]", tables[name]),
                 cast("LazyFrame", tables["core_patients"]),
                 self._n_partitions,
diff --git a/src/pasteur/extras/datasets/mimic/catalog.yml b/src/pasteur/extras/datasets/mimic/catalog.yml
@@ -7,7 +7,7 @@ _mimic_in_csv: &mimic_csv
     sep: ","
     engine: "c"
     header: 0
-    infer_datetime_format: True
+    date_format: "%Y-%m-%d %H:%M:%S"
 
 _mimic_in_chunked: &mimic_chunked # Currently a placeholder
   <<: *mimic_csv
@@ -18,6 +18,7 @@ mimic.raw@core_patients:
   load_args:
     <<: *mimic_csv_load
     index_col: subject_id
+    date_format: "%Y-%m-%d"
     parse_dates: [dod]
     dtype:
       subject_id: int32
@@ -255,17 +256,29 @@ mimic.raw@hosp_poe:
   filepath: ${location}/hosp/poe.csv.gz
   load_args:
     <<: *mimic_csv_load
-    index_col: poe_id
+    index_col: [subject_id, poe_seq] #poe_id
+    usecols:
+      - subject_id
+      - hadm_id
+      - poe_seq
+      - order_status
+      - transaction_type
+      - order_subtype
+      - order_type
+      # - poe_id
+      - discontinue_of_poe_id
+      - discontinued_by_poe_id
+      - ordertime
     parse_dates: [ordertime]
     dtype:
       subject_id: int32
       hadm_id: int32
-      poe_sec: int16
+      poe_seq: int16
       order_status: category
       transaction_type: category
       order_subtype: category
       order_type: category
-      poe_id: object
+      # poe_id: object
       discontinue_of_poe_id: object
       discontinued_by_poe_id: object
 
diff --git a/src/pasteur/extras/metrics/distr.py b/src/pasteur/extras/metrics/distr.py
@@ -223,7 +223,7 @@ def preprocess(
         per_call_meta = []
         base_args = {"domain": self.domain}
 
-        for pid, (cwrk, cref) in LazyDataset.zip([wrk, ref]).items():
+        for cwrk, cref in LazyDataset.zip_values([wrk, ref]):
             for split, split_data in [("wrk", cwrk), ("ref", cref)]:
                 ids, tables = data_to_tables(split_data)
 
@@ -236,7 +236,7 @@ def preprocess(
                             "tables": tables,
                         }
                     )
-                    per_call_meta.append({"split": split, "table": table, "pid": pid})
+                    per_call_meta.append({"split": split, "table": table})
 
         # Process marginals
         out = process_in_parallel(
@@ -287,7 +287,7 @@ def process(
         per_call_meta = []
         base_args = {"domain": self.domain}
 
-        for pid, csyn in LazyDataset.zip(syn).items():
+        for csyn in LazyDataset.zip_values(syn):
             ids, tables = data_to_tables(csyn)
 
             for table in self.domain:
@@ -299,7 +299,7 @@ def process(
                         "tables": tables,
                     }
                 )
-                per_call_meta.append({"table": table, "pid": pid})
+                per_call_meta.append({"table": table})
 
         # Process marginals
         out = process_in_parallel(
diff --git a/src/pasteur/extras/metrics/visual.py b/src/pasteur/extras/metrics/visual.py
@@ -1,4 +1,4 @@
-from typing import Any, NamedTuple, TypeVar
+from typing import Any, NamedTuple, TypeVar, cast
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -8,10 +8,14 @@
 from pandas.core.frame import DataFrame
 from pandas.core.series import Series
 
-from pasteur.metric import AbstractColumnMetric, RefColumnData, Summaries
-
 from ...metadata import ColumnMeta, Metadata
-from ...metric import ColumnMetric, RefColumnMetric, Summaries
+from ...metric import (
+    AbstractColumnMetric,
+    ColumnMetric,
+    RefColumnData,
+    RefColumnMetric,
+    Summaries,
+)
 from ...utils import list_unique
 from ...utils.mlflow import load_matplotlib_style, mlflow_log_hists
 
@@ -263,9 +267,9 @@ class DateData(NamedTuple):
 class DateHist(RefColumnMetric[Summaries[DateData], Summaries[DateData]]):
     name = "date"
 
-    def fit(
-        self, table: str, col: str, meta: ColumnMeta, data: pd.Series, ref: pd.Series
-    ):
+    def fit(self, table: str, col: str | tuple[str], meta: ColumnMeta, data: RefColumnData):
+        ref = data['ref']
+        data = data['data']
         self.table = table
         self.col = col
 
@@ -418,7 +422,7 @@ def process(
         syn: RefColumnData,
         pre: Summaries[DateData],
     ) -> Summaries[DateData]:
-        return pre.replace(syn=self._process(syn["wrk"], syn["ref"]))  # type: ignore
+        return pre.replace(syn=self._process(syn["data"], syn["ref"]))  # type: ignore
 
     def combine(self, summaries: list[Summaries[DateData]]) -> Summaries[DateData]:
         return Summaries(
@@ -596,12 +600,12 @@ def __init__(self, *args, _from_factory: bool = False, **kwargs) -> None:
         self.time = TimeHist(*args, _from_factory=_from_factory, **kwargs)
 
     def fit(
-        self, table: str, col: str, meta: ColumnMeta, data: pd.Series, ref: pd.Series
+        self, table: str, col: str, meta: ColumnMeta, data: RefColumnData
     ):
         self.table = table
         self.col = col
-        self.date.fit(table=table, col=col, meta=meta, data=data, ref=ref)
-        self.time.fit(table=table, col=col, meta=meta, data=data)
+        self.date.fit(table=table, col=col, meta=meta, data=data)
+        self.time.fit(table=table, col=col, meta=meta, data=cast(pd.Series, data['data']))
 
     def preprocess(
         self, wrk: RefColumnData, ref: RefColumnData
diff --git a/src/pasteur/extras/transformers.py b/src/pasteur/extras/transformers.py
@@ -589,8 +589,11 @@ def fit(
     ):
         self.col = cast(str, data.name)
 
-        cdt = self.dt.fit(data, ref)
-        ctt = self.tt.fit(data)
+        self.dt.fit(data, ref)
+        self.tt.fit(data)
+
+        cdt = next(iter(self.dt.get_attributes().values()))
+        ctt = next(iter(self.tt.get_attributes().values()))
         self.attr = Attribute(self.col, vals={**cdt.vals, **ctt.vals}, na=self.nullable)
 
     def get_attributes(self) -> Attributes:
diff --git a/src/pasteur/kedro/dataset/auto.py b/src/pasteur/kedro/dataset/auto.py
@@ -67,7 +67,7 @@ def _save_worker(
         dtypes = p0.dtypes
         for field in old_schema:
             if (
-                isinstance(field.type, pa.dictionaryType)
+                isinstance(field.type, pa.DictionaryType)
                 and field.type.index_type.bit_width == 8
             ):
                 # Expand uint8 dictionaries to uint16
@@ -114,7 +114,7 @@ def _save_worker(
 
             for p in chunk:  # type: ignore
                 try:
-                    w.write(pa.Table.from_pandas(p, schema=schema))
+                    w.write(pa.Table.from_pandas(p, schema=schema, preserve_index=True))
                 except Exception as e:
                     logger.error(f"Error writing chunk:\n{e}")
     else:
diff --git a/src/pasteur/kedro/dataset/multi.py b/src/pasteur/kedro/dataset/multi.py
@@ -74,10 +74,14 @@ def _normalized_path(self) -> str:
         return self._path
 
     def _list_partitions(self) -> list[str]:
+        if not self._filesystem.isdir(self._normalized_path):
+            # Nothing has been saved under this path yet: report no partitions
+            # instead of crashing. isdir() only takes a path, so no load_args here.
+            return []
         return [
-            path
-            for path in self._filesystem.find(self._normalized_path, **self._load_args)
-            if path.endswith(self._filename_suffix)
+            path['name']
+            for path in self._filesystem.listdir(self._normalized_path, **self._load_args)
+            if path['name'].endswith(self._filename_suffix)
         ]
 
     def _join_protocol(self, path: str) -> str:
diff --git a/src/pasteur/kedro/runner/common.py b/src/pasteur/kedro/runner/common.py
@@ -33,7 +33,6 @@ def run_expanded_node(
     node_name = node.name.split("(")[0]
     set_node_name(node_name)
     try:
-
         t = PerformanceTracker.get("nodes")
         t.log_to_file()
         t.start(node_name)
diff --git a/src/pasteur/kedro/runner/sequential.py b/src/pasteur/kedro/runner/sequential.py
@@ -21,7 +21,7 @@
 
 class SimpleSequentialRunner(AbstractRunner):
     """``SimpleRunner`` is a modification of ``SequentialRunner`` that uses a TQDM
-    loading bar. It also force enables async save of datasets.
+    loading bar.
     """
 
     def __init__(
diff --git a/src/pasteur/metadata.py b/src/pasteur/metadata.py
@@ -48,7 +48,7 @@ def __init__(self, **kwargs):
 
         # Ref can be set both by the ref keyword or by extended syntax
         ref = type_ref[1] if len(type_ref) > 1 else None
-        refs = kwargs.get("ref", kwargs.get("refs", ref))
+        ref = kwargs.get("ref", kwargs.get("refs", ref))
             
         # Basic type and dtype data
         self.type = type
diff --git a/src/pasteur/metric.py b/src/pasteur/metric.py
diff --git a/src/pasteur/synth.py b/src/pasteur/synth.py
diff --git a/src/pasteur/table.py b/src/pasteur/table.py
diff --git a/src/pasteur/utils/data.py b/src/pasteur/utils/data.py
diff --git a/src/pasteur/utils/progress.py b/src/pasteur/utils/progress.py