
Commit 1ecda7f

make download api extendable
1 parent 5872490 commit 1ecda7f

10 files changed

Lines changed: 172 additions & 258 deletions


docs/source/index.rst

Lines changed: 0 additions & 4 deletions
@@ -35,10 +35,6 @@ This documentation is a work-in-progress!
 
    tutorial/*
 
-.. toctree::
-   :maxdepth: 2
-   :caption: Modules
-
 
 API documentation
 =================

docs/source/tutorial/introduction.ipynb

Lines changed: 12 additions & 3 deletions
@@ -5,8 +5,16 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-   "# Preliminaries\n",
-   "![Methodology overview](res/graphs-pipeline_simple.svg)"
+   "# Introduction\n",
+   "Welcome and thank you for your interest in synthetic data generation!\n",
+   "\n",
+   "In this tutorial, we're going to set up a Pasteur project from scratch and add\n",
+   "support for a custom dataset with its own evaluation metrics.\n",
+   "Along the way, you will be introduced to Pasteur's architecture and\n",
+   "module system.\n",
+   "\n",
+   "The tutorial begins by detailing Pasteur's architecture and module system.\n",
+   "Afterwards, we create a project and begin integrating and analyzing a new dataset.\n"
  ]
 },
 {
@@ -17,7 +25,8 @@
 ],
 "metadata": {
  "language_info": {
-  "name": "python"
+  "name": "python",
+  "version": "3.10.6"
  }
 },
 "nbformat": 4,

src/pasteur/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -12,4 +12,10 @@
 import importlib_metadata as metadata
 
 version = metadata.version("pasteur")
-__version__ = version
+__version__ = version
+
+def load_ipython_extension(ipython):
+    """ Allows loading ipython functionality with `load_ext pasteur` """
+    from pasteur.kedro.ipython import load_ipython_extension as ld
+
+    ld(ipython)
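
With this hook in place, the Kedro IPython helpers can be pulled into a notebook through IPython's standard extension mechanism, as the new docstring notes. A minimal usage sketch (assuming an IPython/Jupyter session; only the hook itself comes from this commit):

# In an IPython or Jupyter cell:
%load_ext pasteur   # IPython calls pasteur.load_ipython_extension(...)

# Equivalent programmatic form; get_ipython() exists only inside IPython:
from pasteur import load_ipython_extension
load_ipython_extension(get_ipython())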

src/pasteur/dataset.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,7 @@
 from typing import TYPE_CHECKING, Any, Callable
 
 from .module import Module
-from .utils import LazyChunk, LazyFrame, to_chunked
+from .utils import LazyChunk, LazyFrame, to_chunked, RawSource
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -55,7 +55,7 @@ class Dataset(Module):
     key_deps: list[str] = []
     """ Provides the table dependencies (Table, not raw) that are used to create
     the keys of the dataset. """
-
+
     folder_name: str | None = None
     """ Specifies the name of the folder in the raw directory that will be used
     for the dataset's raw sources. If the folder does not exist, the dataset
@@ -65,6 +65,10 @@ class Dataset(Module):
     as a dictionary to be used as is, or as a filepath, in which case
     the path will be loaded and processed, by replacing the paths with appropriate
     ones based on the raw directory and folder name."""
+    raw_sources: dict[str, RawSource] | RawSource | None = None
+    """ A raw source that can be used to download the dataset.
+
+    Optionally, multiple sources can be supplied and downloaded with `pasteur download <name>`."""
 
     bootstrap: Callable[[str, str], None] | None = None
     """ An optional function that is used for one-time tasks (such as extraction).
@@ -163,4 +167,4 @@ def type(self, table: Any):
 
 
 
-__all__ = ["Dataset", "TabularDataset"]
+__all__ = ["RawSource", "Dataset", "TabularDataset"]
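
The new raw_sources attribute lets a Dataset carry the information needed to download itself. A minimal sketch of a dataset declaring a source follows; the RawSource constructor arguments and the subclass fields are assumptions for illustration, since this commit only shows the attribute's type annotation:

# Hypothetical example; RawSource's signature is not shown in this commit,
# so a URL-style source is assumed here.
from pasteur.dataset import Dataset, RawSource

class AdultDataset(Dataset):
    name = "adult"
    folder_name = "adult"
    # A single source; a dict of named RawSources is also accepted,
    # and `pasteur download adult` could then fetch them.
    raw_sources = RawSource(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    )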

src/pasteur/extras/const.py

Lines changed: 0 additions & 123 deletions
This file was deleted.

src/pasteur/extras/download.py

Lines changed: 2 additions & 116 deletions
@@ -1,30 +1,10 @@
-import os
-import subprocess
-from typing import NamedTuple
-
-import logging
-from .const import texas_list
-
-logger = logging.getLogger(__name__)
-
-
-class DS(NamedTuple):
-    files: str | list[str]
-    save_name: str | None = None
-    credentials: bool = False
-    desc: str | None = None
-
+from ..utils.download import DS
 
 physio = "requires credentials and license from https://physionet.org"
 
 datasets = {
     # Open Datasets
     "adult": DS("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"),
-    # Texas, open with license
-    "texas": DS(
-        texas_list,
-        desc="license: https://www.dshs.texas.gov/THCIC/Hospitals/Download.shtm",
-    ),
     # Physionet
     "mimic_iv_1_0": DS(
         "https://physionet.org/files/mimiciv/1.0/", "mimiciv_1_0", True, physio
@@ -44,98 +24,4 @@ class DS(NamedTuple):
         "s3:sdv-datasets",
         desc="license MIT (not clear if that applies to data), requires boto3 package",
     ),
-}
-
-
-def download_files(name: str, dir: str, files: list[str]):
-    if not files:
-        assert False, "Empty file list"
-
-    logger.info(f"Downloading dataset {name} files iteratively with wget.")
-    args = ["wget", "-m", "-np", "-nH", "-c", "-P", dir]
-
-    template_fn = files[0]
-    # We have to skip parent dirs manually
-    cut_dirs = len(template_fn.split("/")) - 4
-    if cut_dirs > 0:
-        args.append(f"--cut-dirs={cut_dirs}")
-
-    args.extend(files)
-    subprocess.run(args)
-
-
-def download_index(
-    name: str, download_dir: str, url_dir: str, username: str | None = None
-):
-    logger.info(f"Downloading dataset {name} through its index listing and wget.")
-    assert url_dir[-1] == "/", "Url dir should end with a `/`"
-
-    args = ["wget", "-m", "-np", "-nH", "-c", "-P", download_dir]
-
-    # We have to skip parent dirs manually
-    cut_dirs = len(url_dir.split("/")) - 4
-    if cut_dirs > 0:
-        args.append(f"--cut-dirs={cut_dirs}")
-
-    args.append(url_dir)
-    if username:
-        args.extend(["--user", username, "--ask-password"])
-    subprocess.run(args)
-
-
-def download_s3(name: str, download_dir: str, bucket: str):
-    try:
-        import boto3
-        from botocore import UNSIGNED
-        from botocore.client import Config
-    except Exception:
-        assert False, "Specified dataset requires the aws package 'boto3'"
-
-    logger.info(f"Downloading dataset {name} from s3 using boto3.")
-    s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
-    ds_bucket = s3.Bucket(bucket)
-
-    for s3_object in ds_bucket.objects.all():
-        _, filename = os.path.split(s3_object.key)
-        fn = os.path.join(download_dir, filename)
-        if os.path.isfile(fn):
-            logger.info(f"File already downloaded, skipping: {filename}")
-            continue
-
-        logger.info(f"Downloading {filename} ({s3_object.size / 1e6:.3f} mb)")
-        ds_bucket.download_file(s3_object.key, fn)
-
-
-def main(download_dir: str, names: list[str], username: str | None):
-    assert os.path.exists(
-        download_dir
-    ), f'Download path "{download_dir}" doesn\'t exist.'
-
-    for name in names:
-        assert name in datasets, f"Dataset {name} not found."
-        ds = datasets[name]
-
-        save_name = ds.save_name or name
-        save_path = os.path.join(download_dir, save_name)
-        os.makedirs(save_path, exist_ok=True)
-
-        if ds.credentials:
-            assert username, f"Dataset requires credentials, use --user <user>"
-
-        if isinstance(ds.files, list):
-            download_files(name, save_path, ds.files)
-        else:
-            assert isinstance(ds.files, str)
-            if ds.files.startswith("s3:"):
-                download_s3(name, save_path, ds.files.replace("s3:", ""))
-            else:
-                download_index(
-                    name, save_path, ds.files, username if ds.credentials else None
-                )
-
-
-def get_description():
-    desc = "The following data stores are available:\n"
-    for name, ds in datasets.items():
-        desc += f"{name:15s}: {ds.desc or ''}\n"
-    return desc
+}
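
Because DS now lives in pasteur.utils.download rather than being private to this module, other packages can declare their own download entries, which is what makes the API extendable. A minimal sketch of such an entry, assuming DS keeps the fields of the NamedTuple visible in the deleted code (files, save_name, credentials, desc):

# Hypothetical downstream entry; field names follow the old NamedTuple above.
from pasteur.utils.download import DS

extra_datasets = {
    "my_dataset": DS(
        "https://example.org/data/my_dataset/",  # index URL to mirror
        save_name="my_dataset",
        credentials=False,
        desc="example entry registered outside pasteur.extras",
    ),
}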
