fix view docs

antheas · antheas · commit c969d9e046fa · 2023-03-18T10:57:09.000Z
diff --git a/src/pasteur/dataset.py b/src/pasteur/dataset.py
@@ -42,68 +42,81 @@ class Dataset(Module):
     @Warning: having a table named raw is not allowed."""
 
     deps: dict[str, list[str]] = {}
+    """ Defines the Tables of the dataset and their dependencies, ex.:
+    
+    ```python
+    {"table1": ["raw1", "raw2"], "table2": ["raw3", "raw4"]}
+    ```
+    """
+
     key_deps: list[str] = []
+    """ Provides the table dependencies (Table, not raw) that are used to create 
+    the keys of the dataset. """
 
     folder_name: str | None = None
+    """ Specifies the name of the folder in the raw directory that will be used
+    for the dataset's raw sources. If the folder does not exist, the dataset
+    is disabled (used for packaging)."""
     catalog: dict[str, Any] | str | None = None
+    """ A kedro catalog that represents the dataset's sources. Can be provided
+    as a dictionary to be used as is, or as a filepath, in which case
+    the path will be loaded and processed, by replacing the paths with appropriate
+    ones based on the raw directory and folder name."""
+
     bootstrap: Callable[[str, str], None] | None = None
+    """ An optional function that is used for one-time tasks (such as extraction).
+    Can be run with `pasteur bootstrap <dataset_name>`. 
+    
+    It is called with 2 paths: the raw directory of the dataset and a second
+    directory, dedicated to the dataset, named bootstrap.
+    If the dataset has any archives, extract them from the raw directory to 
+    bootstrap and then use the bootstrap directory as a base in the catalog."""
 
     def __init__(self, **_) -> None:
         pass
 
     @property
     def raw_tables(self):
+        """Returns the raw dependency names of the dataset."""
         from functools import reduce
 
         return list(dict.fromkeys(reduce(lambda a, b: a + b, self.deps.values(), [])))
 
     @property
     def tables(self):
+        """Returns the table names of the dataset."""
         return list(self.deps.keys())
 
     def ingest(self, name, **tables: Any) -> LazyFrame:
         """Creates the table <name> using the tables provided based on the dependencies.
 
-        The dependencies may be any and should be defined in the catalog.
+        The dependencies may be anything and should be defined in the catalog.
         The raw tables of a dataset are the only kedro datasets explicitly
         defined by the user.
 
         Can return a dataframe, callable which produces a dataframe, or dict of callables, dataframes.
         If it's a dict, the table will be partitioned using the dict keys.
 
         @warning: all partitioned tables should have the same partitions.
-        Some tables may not be partitioned."""
+        Some tables may not be partitioned.
+        
+        Tip: use a `match` statement on `name` to dispatch to per-table functions."""
         raise NotImplemented()
 
     def keys(self, **tables: LazyFrame) -> pd.DataFrame:
-        """Returns a set of keys which split the current dataset (or partition).
+        """Returns a set of keys which split the current dataset.
 
         Keys do not need to be unique per partition, since splitting will also
         be partition based.
+        Gets a set of table partitions based on `key_deps`.
 
-        Gets a set of table partitions based on `key_deps`. All tables are the
-        same partition. If a table is not partitioned, it's the whole DataFrame.
-
-        Shouldn't return a callable."""
+        Use the `to_chunked` operator to handle partitions."""
         raise NotImplemented()
 
     def __str__(self) -> str:
         return self.name
 
 
-class TypedDataset(Dataset):
-    """Extend from to create an intermediary step in ingestion, where the table
-    is loaded from `<dataset>.raw@<table>` to a parquet one `<dataset>.typed.<table>.
-
-    Useful for multiple reads to raw tables. You can also override the `type()` function to make
-    minor changes to the dataset. By default it's the identity.
-
-    Since parquet files don't support chunked loading it's unused."""
-
-    def type(self, table: Any):
-        return table
-
-
 class TabularDataset(Dataset):
     """Boilerplate for a tabular dataset. Assumes the dataset contains one table
     named `table`, the index of which is the keys.
diff --git a/src/pasteur/kedro/pipelines/views.py b/src/pasteur/kedro/pipelines/views.py
@@ -35,8 +35,8 @@ def create_view_pipeline(view: View):
         pipeline(
             [
                 node(
-                    func=view.ingest,
-                    name=f"ingest_{t}",
+                    func=view.query,
+                    name=f"query_{t}",
                     args=[t],
                     inputs={dep: f"{view.dataset}.{dep}" for dep in view.deps[t]},
                     namespace=f"{view}.view",
diff --git a/src/pasteur/view.py b/src/pasteur/view.py
@@ -141,27 +141,40 @@ class View(Module):
     Use `utils.get_relative_fn()` from datasets."""
 
     dataset: str
+    """The name of the View's Dataset. If the Dataset is not loaded, the View
+    is disabled."""
+
     deps: dict[str, list[str]] = {}
+    """ Defines the Tables of the View and their Dataset dependencies, ex.:
+    
+    ```python
+    {"table1": ["master_table1", "master_table2"], "table2": ["master_table3"]}
+    ```
+    """
     trn_deps: dict[str, list[str]] = {}
     parameters: dict[str, Any] | str | None = None
-    tabular: bool = False
 
     def __init__(self, **_) -> None:
         pass
 
     @property
     def dataset_tables(self):
+        """Returns the dataset tables required by the View."""
         from functools import reduce
 
         return list(dict.fromkeys(reduce(lambda a, b: a + b, self.deps.values(), [])))
 
     @property
     def tables(self):
+        """Returns the table names of the view."""
         return list(self.deps.keys())
-
-    def ingest(self, name, **tables: LazyFrame):
-        """Creates the table <name> using the tables provided based on the dependencies."""
-        raise NotImplementedError()
+    
+    def query(self, name, **tables: LazyFrame):
+        """Creates the table <name> from its Dataset tables, like `Dataset.ingest`."""
+        if hasattr(self, "ingest"):
+            # Backwards compatibility: this method was originally named `ingest`.
+            return getattr(self, "ingest")(name, **tables)
+        raise NotImplementedError()
 
     def split_keys(
         self,
@@ -187,13 +200,15 @@ def __str__(self) -> str:
 
 
 class TabularView(View):
+    """Boilerplate for views that are based on tabular datasets.
+    Has one table, named `table`, which is a copy of the table `table` of its
+    Dataset."""
     deps = {"table": ["table"]}
-    tabular: bool = True
 
     @to_chunked
     def ingest(self, name, **tables: LazyChunk):
         assert name == "table"
         return tables["table"]()
 
 
-__all__ = ["View", "TabularView", "filter_by_keys"]
+__all__ = ["View", "TabularView", "split_keys", "filter_by_keys"]