
Commit 1ecda7f

make download api extendable
1 parent 5872490 commit 1ecda7f

10 files changed

Lines changed: 172 additions & 258 deletions


docs/source/index.rst

Lines changed: 0 additions & 4 deletions
@@ -35,10 +35,6 @@ This documentation is a work-in-progress!
 
    tutorial/*
 
-.. toctree::
-   :maxdepth: 2
-   :caption: Modules
-
 
 API documentation
 =================

docs/source/tutorial/introduction.ipynb

Lines changed: 12 additions & 3 deletions
@@ -5,8 +5,16 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-   "# Preliminaries\n",
-   "![Methodology overview](res/graphs-pipeline_simple.svg)"
+   "# Introduction\n",
+   "Welcome and thank you for your interest in synthetic data generation!\n",
+   "\n",
+   "In this tutorial, we're going to set up a Pasteur project from scratch and add\n",
+   "support for a custom dataset with its own evaluation metrics.\n",
+   "Along the way, you will be introduced to Pasteur's architecture and\n",
+   "module system.\n",
+   "\n",
+   "The tutorial begins by detailing Pasteur's architecture and module system.\n",
+   "Afterwards, we create a project and begin integrating and analyzing a new dataset.\n"
  ]
 },
 {
@@ -17,7 +25,8 @@
 ],
 "metadata": {
  "language_info": {
-  "name": "python"
+  "name": "python",
+  "version": "3.10.6"
  }
 },
 "nbformat": 4,

src/pasteur/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -12,4 +12,10 @@
 import importlib_metadata as metadata
 
 version = metadata.version("pasteur")
-__version__ = version
+__version__ = version
+
+def load_ipython_extension(ipython):
+    """ Allows loading ipython functionality with `load_ext pasteur` """
+    from pasteur.kedro.ipython import load_ipython_extension as ld
+
+    ld(ipython)
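
With this hook in place, the Kedro IPython helpers can be pulled into a notebook through IPython's standard extension mechanism, as the new docstring notes. A minimal usage sketch (assuming an IPython/Jupyter session; only the hook itself comes from this commit):

# In an IPython or Jupyter cell:
%load_ext pasteur   # IPython calls pasteur.load_ipython_extension(...)

# Equivalent programmatic form; get_ipython() exists only inside IPython:
from pasteur import load_ipython_extension
load_ipython_extension(get_ipython())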

src/pasteur/dataset.py

Lines changed: 7 additions & 3 deletions
@@ -6,7 +6,7 @@
 from typing import TYPE_CHECKING, Any, Callable
 
 from .module import Module
-from .utils import LazyChunk, LazyFrame, to_chunked
+from .utils import LazyChunk, LazyFrame, to_chunked, RawSource
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -55,7 +55,7 @@ class Dataset(Module):
     key_deps: list[str] = []
     """ Provides the table dependencies (Table, not raw) that are used to create
     the keys of the dataset. """
-
+
     folder_name: str | None = None
     """ Specifies the name of the folder in the raw directory that will be used
     for the dataset's raw sources. If the folder does not exist, the dataset
@@ -65,6 +65,10 @@ class Dataset(Module):
     as a dictionary to be used as is, or as a filepath, in which case
     the path will be loaded and processed, by replacing the paths with appropriate
     ones based on the raw directory and folder name."""
+    raw_sources: dict[str, RawSource] | RawSource | None = None
+    """ A raw source that can be used to download the dataset.
+
+    Optionally, multiple sources can be supplied and downloaded with `pasteur download <name>`."""
 
     bootstrap: Callable[[str, str], None] | None = None
     """ An optional function that is used for one-time tasks (such as extraction).
@@ -163,4 +167,4 @@ def type(self, table: Any):
 
 
 
-__all__ = ["Dataset", "TabularDataset"]
+__all__ = ["RawSource", "Dataset", "TabularDataset"]
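
The new raw_sources attribute lets a Dataset carry the information needed to download itself. A minimal sketch of a dataset declaring a source follows; the RawSource constructor arguments and the subclass fields are assumptions for illustration, since this commit only shows the attribute's type annotation:

# Hypothetical example; RawSource's signature is not shown in this commit,
# so a URL-style source is assumed here.
from pasteur.dataset import Dataset, RawSource

class AdultDataset(Dataset):
    name = "adult"
    folder_name = "adult"
    # A single source; a dict of named RawSources is also accepted,
    # and `pasteur download adult` could then fetch them.
    raw_sources = RawSource(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"
    )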

src/pasteur/extras/const.py

Lines changed: 0 additions & 123 deletions
This file was deleted.

src/pasteur/extras/download.py

Lines changed: 2 additions & 116 deletions
@@ -1,30 +1,10 @@
-import os
-import subprocess
-from typing import NamedTuple
-
-import logging
-from .const import texas_list
-
-logger = logging.getLogger(__name__)
-
-
-class DS(NamedTuple):
-    files: str | list[str]
-    save_name: str | None = None
-    credentials: bool = False
-    desc: str | None = None
-
+from ..utils.download import DS
 
 physio = "requires credentials and license from https://physionet.org"
 
 datasets = {
     # Open Datasets
     "adult": DS("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/"),
-    # Texas, open with license
-    "texas": DS(
-        texas_list,
-        desc="license: https://www.dshs.texas.gov/THCIC/Hospitals/Download.shtm",
-    ),
     # Physionet
     "mimic_iv_1_0": DS(
         "https://physionet.org/files/mimiciv/1.0/", "mimiciv_1_0", True, physio
@@ -44,98 +24,4 @@ class DS(NamedTuple):
         "s3:sdv-datasets",
         desc="license MIT (not clear if that applies to data), requires boto3 package",
     ),
-}
-
-
-def download_files(name: str, dir: str, files: list[str]):
-    if not files:
-        assert False, "Empty file list"
-
-    logger.info(f"Downloading dataset {name} files iteratively with wget.")
-    args = ["wget", "-m", "-np", "-nH", "-c", "-P", dir]
-
-    template_fn = files[0]
-    # We have to skip parent dirs manually
-    cut_dirs = len(template_fn.split("/")) - 4
-    if cut_dirs > 0:
-        args.append(f"--cut-dirs={cut_dirs}")
-
-    args.extend(files)
-    subprocess.run(args)
-
-
-def download_index(
-    name: str, download_dir: str, url_dir: str, username: str | None = None
-):
-    logger.info(f"Downloading dataset {name} through its index listing and wget.")
-    assert url_dir[-1] == "/", "Url dir should end with a `/`"
-
-    args = ["wget", "-m", "-np", "-nH", "-c", "-P", download_dir]
-
-    # We have to skip parent dirs manually
-    cut_dirs = len(url_dir.split("/")) - 4
-    if cut_dirs > 0:
-        args.append(f"--cut-dirs={cut_dirs}")
-
-    args.append(url_dir)
-    if username:
-        args.extend(["--user", username, "--ask-password"])
-    subprocess.run(args)
-
-
-def download_s3(name: str, download_dir: str, bucket: str):
-    try:
-        import boto3
-        from botocore import UNSIGNED
-        from botocore.client import Config
-    except Exception:
-        assert False, "Specified dataset requires the aws package 'boto3'"
-
-    logger.info(f"Downloading dataset {name} from s3 using boto3.")
-    s3 = boto3.resource("s3", config=Config(signature_version=UNSIGNED))
-    ds_bucket = s3.Bucket(bucket)
-
-    for s3_object in ds_bucket.objects.all():
-        _, filename = os.path.split(s3_object.key)
-        fn = os.path.join(download_dir, filename)
-        if os.path.isfile(fn):
-            logger.info(f"File already downloaded, skipping: {filename}")
-            continue
-
-        logger.info(f"Downloading {filename} ({s3_object.size / 1e6:.3f} mb)")
-        ds_bucket.download_file(s3_object.key, fn)
-
-
-def main(download_dir: str, names: list[str], username: str | None):
-    assert os.path.exists(
-        download_dir
-    ), f'Download path "{download_dir}" doesn\'t exist.'
-
-    for name in names:
-        assert name in datasets, f"Dataset {name} not found."
-        ds = datasets[name]
-
-        save_name = ds.save_name or name
-        save_path = os.path.join(download_dir, save_name)
-        os.makedirs(save_path, exist_ok=True)
-
-        if ds.credentials:
-            assert username, f"Dataset requires credentials, use --user <user>"
-
-        if isinstance(ds.files, list):
-            download_files(name, save_path, ds.files)
-        else:
-            assert isinstance(ds.files, str)
-            if ds.files.startswith("s3:"):
-                download_s3(name, save_path, ds.files.replace("s3:", ""))
-            else:
-                download_index(
-                    name, save_path, ds.files, username if ds.credentials else None
-                )
-
-
-def get_description():
-    desc = "The following data stores are available:\n"
-    for name, ds in datasets.items():
-        desc += f"{name:15s}: {ds.desc or ''}\n"
-    return desc
+}
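
Because DS now lives in pasteur.utils.download rather than being private to this module, other packages can declare their own download entries, which is what makes the API extendable. A minimal sketch of such an entry, assuming DS keeps the fields of the NamedTuple visible in the deleted code (files, save_name, credentials, desc):

# Hypothetical downstream entry; field names follow the old NamedTuple above.
from pasteur.utils.download import DS

extra_datasets = {
    "my_dataset": DS(
        "https://example.org/data/my_dataset/",  # index URL to mirror
        save_name="my_dataset",
        credentials=False,
        desc="example entry registered outside pasteur.extras",
    ),
}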
