pasteur-dev
diff --git a/‎notebooks/tst.py‎
Lines changed: 220 additions & 0 deletions b/‎notebooks/tst.py‎
Lines changed: 220 additions & 0 deletions
diff --git a/‎src/pasteur/attribute.py‎
Lines changed: 40 additions & 11 deletions b/‎src/pasteur/attribute.py‎
Lines changed: 40 additions & 11 deletions
diff --git a/‎src/pasteur/extras/metrics/distr.py‎
Lines changed: 4 additions & 2 deletions b/‎src/pasteur/extras/metrics/distr.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎src/pasteur/extras/transformers.py‎
Lines changed: 5 additions & 2 deletions b/‎src/pasteur/extras/transformers.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎src/pasteur/extras/views/mimic/__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎src/pasteur/extras/views/mimic/__init__.py‎
Lines changed: 1 addition & 1 deletion
@@ -0,0 +1,220 @@
+from typing import cast, Any
+
+import pandas as pd
+from pandas import DataFrame, Series
+
+from pasteur.transform import RefTransformer, SeqTransformer, TransformerFactory, Transformer
+from pasteur.module import ModuleFactory, get_module_dict, Module
+from pasteur.attribute import Attribute, Attributes, SeqValue, get_dtype, SeqAttribute, GenAttribute
+from pasteur.extras.transformers import DatetimeTransformer
+
+from project.settings import PASTEUR_MODULES as modules
+
+
+def _backref_cols(
+    ids: pd.DataFrame, seq: pd.Series, data: pd.DataFrame | pd.Series, parent: str
+):
+    """Return, for every sequenced row, the `data` of the previous row of its parent.
+
+    Rows are matched on the pair (`parent` key, sequence number): each row is
+    also registered under its own sequence number plus one, so that a row with
+    number `k` finds the row numbered `k - 1` of the same parent. Rows without
+    a predecessor get missing values.
+
+    Args:
+        ids: Frame indexed by row id that contains the `parent` key column.
+        seq: Sequence number per row id; its name is used as a merge key.
+        data: Values to look up, indexed by row id.
+        parent: Name of the parent key column in `ids`.
+
+    Returns:
+        A Series when `data` is a Series, otherwise a DataFrame, indexed like
+        `seq` and carrying the index name of `data`.
+    """
+    row_key = "_id_lkjijk"
+    prev_key = "_id_zdjwk"
+
+    # Every row, filed under the sequence number of the row that follows it.
+    as_predecessor = ids.join(seq + 1).reset_index(names=prev_key)
+    # Every sequenced row, filed under its own sequence number.
+    as_current = ids.join(seq, how="right").reset_index(names=row_key)
+
+    # FIXME: ids become float
+    matched = as_current.merge(as_predecessor, on=[parent, seq.name], how="left")
+    prev_ids = matched.set_index(row_key)[[prev_key]]  # type: ignore
+
+    # Swap the predecessor id for the predecessor's data.
+    ref_df = prev_ids.join(data, on=prev_key).drop(columns=prev_key)
+    ref_df.index.name = data.index.name
+    return ref_df[data.name] if isinstance(data, pd.Series) else ref_df
+
+
+def _calculate_seq(
+    data: Series, parent: str, col_seq: str, ids: DataFrame | None = None
+):
+    """Number the rows of every parent by the rank of `data`, starting at 0.
+
+    Args:
+        data: Values that order the rows.
+        parent: Name of the parent key column in `ids`.
+        col_seq: Name given to the returned series.
+        ids: Frame holding the `parent` key column; it is aligned with `data`
+            on the index. Callers should always pass it.
+
+    Returns:
+        The per-parent rank minus one, cast to `get_dtype(max_len + 1)` and
+        renamed to `col_seq`. Ties are ranked in order of appearance.
+    """
+    if ids is None:
+        # Backward compatibility only: the previous revision silently read a
+        # session-level ``ids`` variable. Prefer passing ``ids`` explicitly.
+        ids = cast(DataFrame, globals()["ids"])
+
+    _ID_SEQ = "_id_sdfasdf"
+    seq = (
+        cast(
+            pd.Series,
+            pd.concat({parent: ids[parent], _ID_SEQ: data}, axis=1)
+            .groupby(parent)[_ID_SEQ]
+            .rank(method="first"),
+        )
+        - 1
+    )
+    max_len = int(cast(float, seq.max())) + 1
+    return seq.astype(get_dtype(max_len + 1)).rename(col_seq)
+
+
+class SeqTransformerWrapper(SeqTransformer):
+    """Encodes a column of a sequenced table with two wrapped transformers.
+
+    `ctx` encodes the value of the first row (sequence number 0) of every
+    parent, optionally relative to foreign references. `seq` encodes every row
+    relative to the value of the previous row of the same parent.
+
+    Args:
+        modules: Modules searched for `TransformerFactory` entries.
+        ctx: Settings of the context transformer; the "type" key selects the
+            factory and the remaining keys are passed to its `build()`.
+        seq: Settings of the sequence transformer, same layout as `ctx`. The
+            built transformer has to be a `RefTransformer`.
+        parent: Name of the parent table. When omitted it is taken from the
+            `seq_val` passed to `fit()` or from the first foreign reference.
+        seq_col: Column used to order the rows when `data` is a DataFrame and
+            this transformer has to generate the sequence itself.
+        **kwargs: Forwarded to `SeqTransformer.__init__`.
+    """
+
+    name = "seqwrap"
+
+    def __init__(
+        self,
+        modules: list[Module],
+        ctx: dict[str, Any],
+        seq: dict[str, Any],
+        parent: str | None = None,
+        seq_col: str | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        self.parent = parent
+        self.seq_col_ref = seq_col
+
+        # Load transformers
+        assert ctx and seq
+        self.ctx = self._build_transformer(modules, ctx)
+        assert isinstance(self.ctx, Transformer)
+
+        self.seq = self._build_transformer(modules, seq)
+        assert isinstance(self.seq, RefTransformer)
+
+    @staticmethod
+    def _build_transformer(modules: list[Module], spec: dict[str, Any]):
+        """Builds the transformer selected by `spec["type"]` with the other keys as kwargs."""
+        build_kwargs = spec.copy()  # keep the caller's dictionary intact
+        kind = build_kwargs.pop("type")
+        factory = get_module_dict(TransformerFactory, modules)[cast(str, kind)]
+        return factory.build(**build_kwargs)
+
+    def _sequence_rows(self, data: Series | DataFrame, ids: DataFrame) -> Series:
+        """Generates the sequence numbers from `data` (or from its `seq_col` column)."""
+        if isinstance(data, DataFrame):
+            assert self.seq_col_ref is not None, "Multiple columns are provided as input, specify which one is used sequence the table through parameter `seq_col`."
+            seq_col = data[self.seq_col_ref]
+        else:
+            seq_col = data
+        return _calculate_seq(seq_col, cast(str, self.parent), self.col_seq, ids)
+
+    def _context_inputs(
+        self,
+        data: Series | DataFrame,
+        ref: dict[str, DataFrame],
+        ids: DataFrame,
+        seq: Series,
+    ):
+        """Prepares the parent-indexed inputs of the context transformer.
+
+        Returns a pair `(ctx_data, ctx_ref)`; `ctx_ref` is None when `ref` is
+        empty.
+        """
+        parent = cast(str, self.parent)
+
+        # First row of every parent, indexed by the parent key.
+        ctx_data = (
+            ids.join(data[seq == 0], how="right")
+            .drop_duplicates(subset=[parent])
+            .set_index(parent)[self.col]
+        )
+        if not ref:
+            return ctx_data, None
+
+        ctx_ref = ids.drop_duplicates(subset=[parent])
+        for name, ref_table in ref.items():
+            ctx_ref = ctx_ref.join(ref_table, on=name, how="left")
+        ctx_ref = ctx_ref.set_index(parent)
+
+        # A single reference column is handed over as a Series. This is now
+        # done for fit() as well, so both phases see the same reference type.
+        if isinstance(ctx_ref, DataFrame) and ctx_ref.shape[1] == 1:
+            ctx_ref = ctx_ref[next(iter(ctx_ref))]
+        return ctx_data, ctx_ref
+
+    def fit(
+        self,
+        table: str,
+        data: Series | DataFrame,
+        ref: dict[str, DataFrame],
+        ids: DataFrame,
+        seq_val: SeqValue | None = None,
+        seq: Series | None = None,
+    ) -> tuple[SeqValue, Series] | None:
+        """Fits both wrapped transformers.
+
+        Args:
+            table: Name of the table `data` belongs to.
+            data: Column(s) to encode, indexed by row id.
+            ref: Foreign reference tables by name; each is joined to `ids`
+                through the `ids` column of the same name.
+            ids: Frame indexed by row id that holds the key columns.
+            seq_val: Existing sequence value; supplies the parent table and
+                the name of the sequence column when given.
+            seq: Existing sequence numbers. Generated from `data` when None.
+
+        Returns:
+            `(SeqValue, seq)` when `seq_val` is None (this transformer then
+            acts as the sequencer), otherwise None.
+        """
+        # NOTE(review): a DataFrame has no `.name`, so DataFrame input fails
+        # here although `seq_col` hints that it should work - confirm.
+        self.col = cast(str, data.name)
+        self.table = table
+
+        # Grab parent from seq_val if available
+        if seq_val is not None:
+            self.parent = seq_val.table
+            self.col_seq = seq_val.name
+        else:
+            self.col_seq = f"{table}_seq"
+        self.col_n = f"{table}_n"
+
+        if not self.parent:
+            # Infer the parent from the first foreign reference. The default
+            # keeps an empty `ref` from raising a bare StopIteration, so the
+            # assertion below can report the actual problem.
+            self.parent = next(iter(ref or {}), None)
+
+        assert (
+            self.parent
+        ), "Parent table not specified, use parameter 'parent' or a foreign reference."
+
+        # If seq was not provided, this transformer generates it (also in transform)
+        self.generate_seq = seq is None
+        if seq is None:
+            seq = self._sequence_rows(data, ids)
+        # int() leaves the NumPy scalar type, so the `+ 1` cannot wrap around
+        # for a small unsigned dtype.
+        self.max_len = int(seq.max()) + 1
+
+        ctx_data, ctx_ref = self._context_inputs(data, ref, ids, seq)
+        if ctx_ref is not None:
+            assert isinstance(
+                self.ctx, RefTransformer
+            ), "Reference found, initial transformer should be a reference transformer."
+            self.ctx.fit(ctx_data, ctx_ref)
+        else:
+            self.ctx.fit(ctx_data)
+
+        # Every row is fitted against the value of its predecessor; the first
+        # row of a parent has none and receives a missing reference.
+        ref_df = _backref_cols(ids, seq, data, self.parent)
+        self.seq.fit(data, ref_df)
+
+        # If a seq_val was not provided, assume seq was also none and
+        # become the sequencer
+        if seq_val is None:
+            return SeqValue(self.col_seq, self.parent), cast(Series, seq)
+        return None
+
+    def reduce(self, other: "SeqTransformerWrapper"):
+        """Merges the fitted state of `other` into this wrapper."""
+        # Each child is reduced with its counterpart, not with the wrapper.
+        self.ctx.reduce(other.ctx)
+        self.seq.reduce(other.seq)
+        self.max_len = max(other.max_len, self.max_len)
+
+    def transform(
+        self,
+        data: Series | DataFrame,
+        ref: dict[str, DataFrame],
+        ids: DataFrame,
+        seq: Series | None = None,
+    ) -> tuple[DataFrame, dict[str, DataFrame]] | tuple[
+        DataFrame, dict[str, DataFrame], Series
+    ]:
+        """Encodes `data` with the fitted transformers.
+
+        Args:
+            data: Column(s) to encode, indexed by row id.
+            ref: Foreign reference tables by name, as in `fit()`.
+            ids: Frame indexed by row id that holds the key columns.
+            seq: Sequence numbers; required unless this transformer generated
+                the sequence during `fit()`.
+
+        Returns:
+            `(enc, {parent: ctx})`. When the sequence is generated here, the
+            parent frame also carries the per-parent row count (column
+            `<table>_n`) and the sequence is appended as a third element.
+        """
+        parent = cast(str, self.parent)
+        if self.generate_seq:
+            seq = self._sequence_rows(data, ids)
+        else:
+            assert seq is not None
+
+        ctx_data, ctx_ref = self._context_inputs(data, ref, ids, seq)
+        if ctx_ref is not None:
+            assert isinstance(
+                self.ctx, RefTransformer
+            ), "Reference found, initial transformer should be a reference transformer."
+            ctx = self.ctx.transform(ctx_data, ctx_ref)
+        else:
+            ctx = self.ctx.transform(ctx_data)
+
+        # Every row is encoded against the value of its predecessor; the first
+        # row of a parent has none and receives a missing reference.
+        ref_df = _backref_cols(ids, seq, data, parent)
+        enc = self.seq.transform(data, ref_df)
+
+        if self.generate_seq:
+            # Rows per parent: highest sequence number plus one.
+            lengths = (
+                ids.join(seq).groupby(parent)[cast(str, seq.name)].max().rename(self.col_n)
+                + 1
+            )
+            return enc, {parent: pd.concat([ctx, lengths], axis=1)}, seq
+        return enc, {parent: ctx}
+
+    def get_attributes(self) -> tuple[Attributes, dict[str, Attributes]]:
+        """Returns the attributes of the table and, keyed by parent, of the parent."""
+        parent = cast(str, self.parent)
+        table_attrs = {
+            self.col_seq: SeqAttribute(self.col_seq, parent),
+            **self.seq.get_attributes(),
+        }
+        # NOTE(review): the `<table>_n` column produced by transform() ranges
+        # over 1..max_len - confirm that a GenAttribute built with `max_len`
+        # covers the value `max_len` itself.
+        parent_attrs = {
+            **self.ctx.get_attributes(),
+            self.col_n: GenAttribute(self.col_n, self.table, self.max_len),
+        }
+        return table_attrs, {parent: parent_attrs}
+
+
+# Notebook smoke run: fit and apply the wrapper on the admission times.
+# `admissions`, `patients` and `ids` are not defined in this file; they are
+# expected to exist already in the interactive session.
+s = SeqTransformerWrapper(
+    modules=modules,
+    ctx={"type": "datetime", "nullable": True},
+    seq={"type": "datetime", "nullable": True},
+)
+s.fit(
+    table="admissions",
+    data=admissions["admittime"],
+    ref={"patients": patients[["birth_year"]]},
+    ids=ids,
+)
+r = s.transform(
+    data=admissions["admittime"],
+    ref={"patients": patients[["birth_year"]]},
+    ids=ids,
+)
+# Bare expression so the notebook displays the longest sequence length seen in fit().
+s.max_len
@@ -44,7 +44,7 @@ def get_dtype(domain: int):
 
 
 class Grouping(list[GI]):
-    """ An enchanced form of list that holds the type of grouping (categorical, ordinal),
+    """An enchanced form of list that holds the type of grouping (categorical, ordinal),
     and implements helper functions and an enchanced string representation."""
 
     def __init__(self, type: Literal["cat", "ord"], arr: list["Grouping | Any"]):
@@ -190,19 +190,28 @@ def from_str(
 
 
 class Value:
-    """ Base value class """
-    name: str | tuple[str] | None = None
+    """Base value class"""
+
+    name: str
     common: int = 0
 
 
+class SeqValue(Value):
+    """Value that describes a sequence column.
+
+    It only stores its own name and the name of a table.
+    NOTE(review): callers appear to pass the parent table that the sequence
+    is counted within - confirm.
+    """
+
+    table: str
+
+    def __init__(self, name: str, table: str) -> None:
+        self.name, self.table = name, table
+
+
 class CatValue(Value):
-    """ Class for a Categorical Value.
-    
+    """Class for a Categorical Value.
+
     Each Categorical Value is represented by an unsigned integer.
     It can also group its different values together based on an integer parameter
     named height.
     The implementation of this class remains abstract, and is expanded in
-    the StratifiedValue class. """
+    the StratifiedValue class."""
 
     def get_domain(self, height: int = 0) -> int:
         """Returns the domain of the attribute in the given height."""
@@ -228,7 +237,7 @@ def is_ordinal(self) -> bool:
         return False
 
     def downsample(self, value: np.ndarray, height: int):
-        """ Receives an array named `value` and downsamples it based on the provided
+        """Receives an array named `value` and downsamples it based on the provided
         height, by grouping certain values together. The proper implementation
         is provided by pasteur.hierarchy."""
         if height == 0:
@@ -239,7 +248,7 @@ def upsample(self, value: np.ndarray, height: int, deterministic: bool = True):
         """Does the opposite of downsample. If deterministic is True, for each
         group at a given height one of its values is chosen arbitrarily to represent
         all children of the group.
-        
+
         If deterministic is False, the group is sampled based on this Value's
         histogram (not implemented in this class; see pasteur.hierarchy)."""
         if height == 0:
@@ -263,12 +272,14 @@ def upsample(self, value: np.ndarray, height: int, deterministic: bool = True):
     def select_height(self) -> int:
         return 0
 
+
 IdxValue = CatValue
 
+
 class StratifiedValue(CatValue):
-    """A version of CategoricalValue which uses a Stratification to represent 
-    the domain knowledge of the Value. 
-    
+    """A version of CategoricalValue which uses a Stratification to represent
+    the domain knowledge of the Value.
+
     Each unique value is mapped to a tree
     with nodes where the child order matters.
     By traversing the tree in DFS, each leaf is mapped to an integer."""
@@ -304,6 +315,14 @@ def is_ordinal(self) -> bool:
     def height(self):
         return self.head.height
 
+class GenerationValue(StratifiedValue):
+    """Stratified value whose domain is the ordinal grouping `0 .. max_len - 1`.
+
+    The table name and `max_len` are kept on the instance.
+    """
+
+    table: str
+    max_len: int
+
+    def __init__(self, table: str, max_len: int) -> None:
+        self.table = table
+        self.max_len = max_len
+        # One ordinal level per integer below `max_len`.
+        levels = Grouping("ord", list(range(max_len)))
+        # NOTE(review): the meaning of the second argument (0) is defined by
+        # StratifiedValue.__init__, which is not visible here.
+        super().__init__(levels, 0)
 
 def _create_strat_value_cat(vals, na: bool = False, ukn_val: Any | None = None):
     arr = []
@@ -434,6 +453,16 @@ def NumAttribute(
     return Attribute(name, {name: NumValue(bins, min, max)}, nullable, False)
 
 
+def SeqAttribute(name: str, table: str):
+    """Wraps a single `SeqValue(name, table)` in an Attribute called `name`.
+
+    Both trailing flags of `Attribute` are passed as False.
+    """
+    value = SeqValue(name, table)
+    return Attribute(name, {name: value}, False, False)
+
+
+def GenAttribute(name: str, table: str, max_len: int):
+    """Wraps a single `GenerationValue(table, max_len)` in an Attribute called `name`.
+
+    Both trailing flags of `Attribute` are passed as False.
+    """
+    value = GenerationValue(table, max_len)
+    return Attribute(name, {name: value}, False, False)
+
+
 __all__ = [
     "get_dtype",
     "Grouping",
 
@@ -10,11 +10,11 @@
 from numpy import ndarray
 from scipy.special import rel_entr
 from scipy.stats import chisquare
-from pasteur.metric import Summaries
 
+from pasteur.metric import Summaries
 from pasteur.utils import LazyDataset
 
-from ...attribute import Attributes, CatValue, get_dtype
+from ...attribute import Attributes, CatValue, SeqValue, get_dtype
 from ...metric import Metric, Summaries
 from ...utils import LazyChunk, LazyFrame, data_to_tables
 from ...utils.progress import process_in_parallel
@@ -209,6 +209,8 @@ def fit(
         for table, attrs in meta.items():
             for attr in attrs.values():
                 for name, val in attr.vals.items():
+                    if isinstance(val, SeqValue):
+                        continue
                     assert isinstance(val, CatValue)
                     self.domain[table][name] = val.domain
 
 
@@ -2,7 +2,7 @@
 
 import numpy as np
 import pandas as pd
-from pandas.api.types import is_categorical_dtype
+from pandas.api.types import is_categorical_dtype, is_float_dtype
 
 from pasteur.attribute import Attributes
 from pasteur.transform import RefTransformer, Transformer
@@ -401,7 +401,7 @@ def reverse(self, data: pd.DataFrame, ref: pd.Series | None = None) -> pd.Series
                 na_mask |= np.any(vals[dcols] == 0, axis=1)
 
             if ref is not None:
-                na_mask |= pd.isna(ref)
+                na_mask = pd.isna(ref) | na_mask
                 ref = ref[~na_mask]
             vals = vals[~na_mask]
             ofs = 1
@@ -646,6 +646,9 @@ def transform(self, data: pd.Series, ref: pd.Series | None = None) -> pd.DataFra
         date_enc = self.dt.transform(data, ref)
         time_enc = self.tt.transform(data)
         del data, ref
+        if self.nullable:
+            c = date_enc[next(iter(date_enc))]
+            time_enc[pd.isna(c) if is_float_dtype(c) else c == 0] = 0
         return pd.concat([date_enc, time_enc], axis=1, copy=False, join="inner")
 
     def reverse(
 
@@ -54,7 +54,7 @@ def ingest(self, name, **tables: LazyChunk):
             case "admissions":
                 return tables["core_admissions"]()
             case "transfers":
-                return tables["core_transfers"]()
+                return tables["core_transfers"]().dropna(subset=['hadm_id'])
             case other:
                 assert False, f"Table {other} not part of view {self.name}"