@@ -42,68 +42,81 @@ class Dataset(Module):
4242 @Warning: having a table named raw is not allowed."""
4343
4444 deps : dict [str , list [str ]] = {}
45+ """ Defines the Tables of the dataset and their dependencies, ex.:
46+
47+ ```python
48+ {"table1": ["raw1", "raw2"], "table2": ["raw3", "raw4"]}
49+ ```
50+ """
51+
4552 key_deps : list [str ] = []
53+ """ Provides the table dependencies (Table, not raw) that are used to create
54+ the keys of the dataset. """
4655
4756 folder_name : str | None = None
57+ """ Specifies the name of the folder in the raw directory that will be used
58+ for the dataset's raw sources. If the folder does not exist, the dataset
59+ is disabled (used for packaging)."""
4860 catalog : dict [str , Any ] | str | None = None
61+ """ A kedro catalog that represents the dataset's sources. Can be provided
62+ as a dictionary to be used as is, or as a filepath, in which case
63+ the path will be loaded and processed, by replacing the paths with appropriate
64+ ones based on the raw directory and folder name."""
65+
4966 bootstrap : Callable [[str , str ], None ] | None = None
67+ """ An optional function that is used for one-time tasks (such as extraction).
68+ Can be run with `pasteur bootstrap <dataset_name>`.
69+
70+ Is provided with 2 paths: the raw directory of the dataset and another
71+ directory dedicated to the dataset named bootstrap.
72+ If the dataset has any archives, extract them from the raw directory to
73+ bootstrap and then use the bootstrap directory as a base in the catalog."""
5074
def __init__(self, **_) -> None:
    """Accepts arbitrary keyword arguments and ignores them; sets no state."""
    return None
5377
@property
def raw_tables(self):
    """Returns the raw dependency names of the dataset.

    The dependency lists stored in `deps` are flattened into a single list.
    Each name appears once, in the order it is first encountered.
    """
    from itertools import chain

    # chain.from_iterable walks every dependency list in a single pass,
    # avoiding the repeated list copies of pairwise concatenation.
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(chain.from_iterable(self.deps.values())))
5984
@property
def tables(self):
    """Returns the table names of the dataset, i.e. the keys of `deps`."""
    # Iterating a dict yields its keys in insertion order.
    return list(self.deps)
6389
def ingest(self, name, **tables: Any) -> LazyFrame:
    """Creates the table <name> using the tables provided based on the dependencies.

    The dependencies may be anything and should be defined in the catalog.
    The raw tables of a dataset are the only kedro datasets explicitly
    defined by the user.

    Can return a dataframe, callable which produces a dataframe, or dict of callables, dataframes.
    If it's a dict, the table will be partitioned using the dict keys.

    @warning: all partitioned tables should have the same partitions.
    Some tables may not be partitioned.

    Tip: use a `match` statement to fork based on table name to per-table functions.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    # `NotImplemented` is a non-callable constant meant for binary operator
    # methods; calling it raises TypeError. NotImplementedError is the
    # exception for methods that have no implementation here.
    raise NotImplementedError()
77105
def keys(self, **tables: LazyFrame) -> pd.DataFrame:
    """Returns a set of keys which split the current dataset.

    Keys do not need to be unique per partition, since splitting will also
    be partition based.
    Gets a set of table partitions based on `key_deps`.

    Use the `to_chunked` operator to handle partitions.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    # `NotImplemented` is a non-callable constant; calling it raises
    # TypeError instead of signalling a missing implementation.
    raise NotImplementedError()
89115
def __str__(self) -> str:
    """Uses the dataset's `name` attribute as its string form."""
    label = self.name
    return label
92118
93119
class TypedDataset(Dataset):
    """Extend from this class to add an intermediary step in ingestion, where
    the table is loaded from `<dataset>.raw@<table>` to a parquet one,
    `<dataset>.typed.<table>`.

    Useful when raw tables are read multiple times. The `type()` function can
    also be overridden to make minor changes to the dataset.

    Since parquet files don't support chunked loading it's unused."""

    def type(self, table: Any):
        """Hook applied to a table; by default it is the identity."""
        return table
106-
107120class TabularDataset (Dataset ):
108121 """Boilerplate for a tabular dataset. Assumes the dataset contains one table
109122 named `table`, the index of which is the keys.
0 commit comments