Skip to content

Commit c969d9e

Browse files
committed
fix view docs
1 parent a8dd683 commit c969d9e

3 files changed

Lines changed: 57 additions & 29 deletions

File tree

src/pasteur/dataset.py

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -42,68 +42,81 @@ class Dataset(Module):
4242
@Warning: having a table named raw is not allowed."""
4343

4444
deps: dict[str, list[str]] = {}
45+
""" Defines the Tables of the dataset and their dependencies, ex.:
46+
47+
```python
48+
{"table1": ["raw1", "raw2"], "table2": ["raw3", "raw4"]}
49+
```
50+
"""
51+
4552
key_deps: list[str] = []
53+
""" Provides the table dependencies (Table, not raw) that are used to create
54+
the keys of the dataset. """
4655

4756
folder_name: str | None = None
57+
""" Specifies the name of the folder in the raw directory that will be used
58+
for the dataset's raw sources. If the folder does not exist, the dataset
59+
is disabled (used for packaging)."""
4860
catalog: dict[str, Any] | str | None = None
61+
""" A kedro catalog that represents the dataset's sources. Can be provided
62+
as a dictionary to be used as is, or as a filepath, in which case
63+
the path will be loaded and processed, by replacing the paths with appropriate
64+
ones based on the raw directory and folder name."""
65+
4966
bootstrap: Callable[[str, str], None] | None = None
67+
""" An optional function that is used for one-time tasks (such as extraction).
68+
Can be run with `pasteur bootstrap <dataset_name>`.
69+
70+
Is provided with 2 paths: the raw directory of the dataset and another
71+
directory dedicated to the dataset named bootstrap.
72+
If the dataset has any archives, extract them from the raw directory to
73+
bootstrap and then use the bootstrap directory as a base in the catalog."""
5074

5175
def __init__(self, **_) -> None:
5276
pass
5377

5478
@property
5579
def raw_tables(self):
80+
"""Returns the raw dependency names of the dataset."""
5681
from functools import reduce
5782

5883
return list(dict.fromkeys(reduce(lambda a, b: a + b, self.deps.values(), [])))
5984

6085
@property
6186
def tables(self):
87+
"""Returns the table names of the dataset."""
6288
return list(self.deps.keys())
6389

6490
def ingest(self, name, **tables: Any) -> LazyFrame:
6591
"""Creates the table <name> using the tables provided based on the dependencies.
6692
67-
The dependencies may be any and should be defined in the catalog.
93+
The dependencies may be anything and should be defined in the catalog.
6894
The raw tables of a dataset are the only kedro datasets explicitly
6995
defined by the user.
7096
7197
Can return a dataframe, callable which produces a dataframe, or dict of callables, dataframes.
7298
If it's a dict, the table will be partitioned using the dict keys.
7399
74100
@warning: all partitioned tables should have the same partitions.
75-
Some tables may not be partitioned."""
101+
Some tables may not be partitioned.
102+
103+
Tip: use a `match` statement to fork based on table name to per-table functions."""
76104
raise NotImplementedError()
77105

78106
def keys(self, **tables: LazyFrame) -> pd.DataFrame:
79-
"""Returns a set of keys which split the current dataset (or partition).
107+
"""Returns a set of keys which split the current dataset.
80108
81109
Keys do not need to be unique per partition, since splitting will also
82110
be partition based.
111+
Gets a set of table partitions based on `key_deps`.
83112
84-
Gets a set of table partitions based on `key_deps`. All tables are the
85-
same partition. If a table is not partitioned, it's the whole DataFrame.
86-
87-
Shouldn't return a callable."""
113+
Use the `to_chunked` operator to handle partitions."""
88114
raise NotImplementedError()
89115

90116
def __str__(self) -> str:
91117
return self.name
92118

93119

94-
class TypedDataset(Dataset):
95-
"""Extend from to create an intermediary step in ingestion, where the table
96-
is loaded from `<dataset>.raw@<table>` to a parquet one `<dataset>.typed.<table>`.
97-
98-
Useful for multiple reads to raw tables. You can also override the `type()` function to make
99-
minor changes to the dataset. By default it's the identity.
100-
101-
Since parquet files don't support chunked loading it's unused."""
102-
103-
def type(self, table: Any):
104-
return table
105-
106-
107120
class TabularDataset(Dataset):
108121
"""Boilerplate for a tabular dataset. Assumes the dataset contains one table
109122
named `table`, the index of which is the keys.

src/pasteur/kedro/pipelines/views.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ def create_view_pipeline(view: View):
3535
pipeline(
3636
[
3737
node(
38-
func=view.ingest,
39-
name=f"ingest_{t}",
38+
func=view.query,
39+
name=f"query_{t}",
4040
args=[t],
4141
inputs={dep: f"{view.dataset}.{dep}" for dep in view.deps[t]},
4242
namespace=f"{view}.view",

src/pasteur/view.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -141,27 +141,40 @@ class View(Module):
141141
Use `utils.get_relative_fn()` from datasets."""
142142

143143
dataset: str
144+
"""The name of the View's Dataset. If the Dataset is not loaded, the View
145+
is disabled."""
146+
144147
deps: dict[str, list[str]] = {}
148+
""" Defines the Tables of the View and their Dataset dependencies, ex.:
149+
150+
```python
151+
{"table1": ["master_table1", "master_table2"], "table2": ["master_table3"]}
152+
```
153+
"""
145154
trn_deps: dict[str, list[str]] = {}
146155
parameters: dict[str, Any] | str | None = None
147-
tabular: bool = False
148156

149157
def __init__(self, **_) -> None:
150158
pass
151159

152160
@property
153161
def dataset_tables(self):
162+
"""Returns the dataset tables required by the View."""
154163
from functools import reduce
155164

156165
return list(dict.fromkeys(reduce(lambda a, b: a + b, self.deps.values(), [])))
157166

158167
@property
159168
def tables(self):
169+
"""Returns the table names of the view."""
160170
return list(self.deps.keys())
161-
162-
def ingest(self, name, **tables: LazyFrame):
163-
"""Creates the table <name> using the tables provided based on the dependencies."""
164-
raise NotImplementedError()
171+
172+
def query(self, name, **tables: LazyFrame):
173+
""" Equivalent to ingest in Dataset. """
174+
if hasattr(self, "ingest"):
175+
# Original name for function was ingest.
176+
return getattr(self, "ingest")(name, **tables)
177+
raise NotImplementedError()
165178

166179
def split_keys(
167180
self,
@@ -187,13 +200,15 @@ def __str__(self) -> str:
187200

188201

189202
class TabularView(View):
203+
"""Boilerplate for views that are based on tabular datasets.
204+
Has one table, named `table`, which is a copy of the table `table` of its
205+
Dataset."""
190206
deps = {"table": ["table"]}
191-
tabular: bool = True
192207

193208
@to_chunked
194209
def ingest(self, name, **tables: LazyChunk):
195210
assert name == "table"
196211
return tables["table"]()
197212

198213

199-
__all__ = ["View", "TabularView", "filter_by_keys"]
214+
__all__ = ["View", "TabularView", "split_keys", "filter_by_keys"]

0 commit comments

Comments
 (0)