Skip to content

Commit d20f2fd

Browse files
Add dataset reference to dataset decorator (openproblems-bio#500)
* add dataset reference * add reference to multimodal data * add DOIs for zebrafish, pancreas, immune * refactor and add reference to 10x 1k * add missing references * document addition to loader decorator
1 parent f923f03 commit d20f2fd

34 files changed

Lines changed: 112 additions & 51 deletions

File tree

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -256,8 +256,8 @@ def f2(adata):
256256

257257
Datasets are loaded under `openproblems/data`. Each data loading function should
258258
download the appropriate dataset from a stable location (e.g. from Figshare) and be
259-
decorated with `openproblems.data.utils.loader(data_url="https://data.link")` in order
260-
to cache the result.
259+
decorated with `openproblems.data.utils.loader(data_url="https://data.link",
260+
data_reference="https://doi.org/10.0/123")` in order to cache the result.
261261

262262
Data should be provided in a raw count format. We assume that `adata.X` contains the raw
263263
(count) data for the primary modality; this will also be copied to

openproblems/data/Wagner_2018_zebrafish_embryo_CRISPR.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,10 @@
44
import scprep
55

66

7-
@utils.loader(data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112294")
7+
@utils.loader(
8+
data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE112294",
9+
data_reference="https://doi.org/10.1126/science.aar4362",
10+
)
811
def load_zebrafish_chd_tyr(test=False):
912
"""Download zebrafish data from GEO accession GSE112294"""
1013

openproblems/data/cengen.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111
)
1212

1313

14-
@utils.loader(data_url=URL)
14+
@utils.loader(
15+
data_url=URL, data_reference="https://doi.org/10.1016/j.neuron.2018.07.042"
16+
)
1517
def load_cengen(test=False):
1618
"""Download CeNGEN data from GitHub.
1719

openproblems/data/human_blood_nestorowa2016.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
URL = "https://ndownloader.figshare.com/files/25555751"
99

1010

11-
@utils.loader(data_url=URL)
11+
@utils.loader(
12+
data_url=URL, data_reference="https://doi.org/10.1182/blood-2016-05-716480"
13+
)
1214
def load_human_blood_nestorowa2016(test=False):
1315
"""Download Nesterova data from Figshare."""
1416
if test:

openproblems/data/immune_cells.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
URL = "https://ndownloader.figshare.com/files/25717328"
99

1010

11-
@utils.loader(data_url=URL)
11+
@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/s41592-021-01336-8")
1212
def load_immune(test=False):
1313
"""Download immune human data from figshare."""
1414
if test:

openproblems/data/mouse_blood_olssen_labelled.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
URL = "https://ndownloader.figshare.com/files/27346712"
99

1010

11-
@utils.loader(data_url=URL)
11+
@utils.loader(data_url=URL, data_reference="https://doi.org/10.1038/nature19348")
1212
def load_olsson_2016_mouse_blood(test=False):
1313
"""Download Olsson, 2016_mouse_blood, Nature, 2016 data from Figshare."""
1414
if test:

openproblems/data/multimodal/citeseq.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
)
1616

1717

18-
@loader(data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866")
18+
@loader(
19+
data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE100866",
20+
data_reference="https://doi.org/10.1038/nmeth.4380",
21+
)
1922
def load_citeseq_cbmc(test=False):
2023
"""Download CITEseq data from GEO."""
2124
if test:

openproblems/data/multimodal/sample.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,10 @@
1111
import scipy.sparse
1212

1313

14-
@loader(data_url="https://openproblems.bio")
14+
@loader(
15+
data_url="https://openproblems.bio",
16+
data_reference="https://github.com/openproblems-bio/openproblems",
17+
)
1518
def load_sample_data(test=True):
1619
"""Create a simple dataset to use for testing in multimodal applications."""
1720
assert test

openproblems/data/multimodal/scicar/base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import scprep
77
import tempfile
88

9+
DATA_REFERENCE = "https://doi.org/10.1126/science.aau0730"
10+
911

1012
def load_scicar(
1113
rna_url,

openproblems/data/multimodal/scicar/cell_lines.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from ...utils import loader
22
from ..utils import subset_joint_data
3+
from .base import DATA_REFERENCE
34
from .base import load_scicar
45

56
rna_url = (
@@ -28,7 +29,10 @@
2829
)
2930

3031

31-
@loader(data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089")
32+
@loader(
33+
data_url="https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE117089",
34+
data_reference=DATA_REFERENCE,
35+
)
3236
def load_scicar_cell_lines(test=False):
3337
"""Download sci-CAR cell lines data from GEO."""
3438
if test:

0 commit comments

Comments
 (0)