Skip to content

Commit 091bfb8

Browse files
danielStrobl, scottgigante-immunai, github-actions[bot]
authored
add new dataloaders (#792)
* add new dataloaders * task dataloaders * address comments * correct docstring * change dataset name * remove whitespace * add to init * pre-commit * filter celltypes * pre-commit * filter celltypes * pre-commit * update urllib requirement * pre-commit * urllib * Use scprep 1.2.2 * remove immune human mouse for now * Remove urllib dep * Remove broken import --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Former-commit-id: c366d94
1 parent b9e0772 commit 091bfb8

7 files changed

Lines changed: 90 additions & 1 deletion

File tree

openproblems/data/lung.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from . import utils
2+
3+
import os
4+
import scprep
5+
import tempfile
6+
7+
# sparsified from https://figshare.com/articles/dataset/Benchmarking_atlas-level_data_integration_in_single-cell_genomics_-_integration_task_datasets_Immune_and_pancreas_/12420968/2 # noqa: E501
8+
URL = "https://figshare.com/ndownloader/files/24539942"
9+
10+
11+
@utils.loader(data_url=URL, data_reference="luecken2022benchmarking")
def load_lung(test=False):
    """Download the lung atlas from figshare and return it as an AnnData.

    With ``test=False`` the h5ad file at ``URL`` is downloaded into a
    temporary directory and read with scanpy. The matrix originally stored in
    ``X`` is kept under ``layers["log_normalized"]`` and ``X`` is replaced by
    ``layers["counts"]``.

    With ``test=True`` the full dataset is loaded first and then reduced to
    the first 500 genes and at most 250 cells from each of the batches
    ``"4"`` and ``"A6"``.

    In both modes ``utils.filter_genes_cells`` is applied before returning.
    """
    import scanpy as sc

    if not test:
        with tempfile.TemporaryDirectory() as download_dir:
            h5ad_path = os.path.join(download_dir, "Lung_atlas_public.h5ad")
            scprep.io.download.download_url(URL, h5ad_path)
            adata = sc.read(h5ad_path)

            # NOTE: the file stores log-normalized values in X, so keep them
            # in a layer and expose the raw counts as X instead.
            adata.layers["log_normalized"] = adata.X
            adata.X = adata.layers["counts"]

        # Drop cells and genes with 0 counts.
        utils.filter_genes_cells(adata)
        return adata

    # Start from the full dataset (the loader decorator may serve it from
    # cache -- not visible from here).
    full_data = load_lung(test=False)

    # Subsample the lung data: first 500 genes, then up to 250 cells from
    # each of two batches.
    gene_subset = full_data[:, :500].copy()
    batch_labels = gene_subset.obs.batch
    first_batch = gene_subset[batch_labels == "4"][:250]
    second_batch = gene_subset[batch_labels == "A6"][:250]
    adata = first_batch.concatenate(second_batch)
    # Note: highly variable genes could be used here instead of simply the
    # first 500 genes.

    # Drop cells and genes with 0 counts.
    utils.filter_genes_cells(adata)
    return adata
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from .immune import immune_batch
2+
from .lung import lung_batch
23
from .pancreas import pancreas_batch
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from .....data.lung import load_lung
2+
from .....tools.decorators import dataset
3+
from ..utils import filter_celltypes
4+
from typing import Optional
5+
6+
7+
@dataset(
    # NOTE(review): the summary below spells the author "Vieira Braga";
    # "Viera" here looks like a typo, but the name is left unchanged in case
    # it is used as a key elsewhere -- confirm before renaming.
    dataset_name="Lung (Viera Braga et al.)",
    data_url=load_lung.metadata["data_url"],
    data_reference=load_lung.metadata["data_reference"],
    # Adjacent string literals are joined with no separator, so the space
    # between the two sentences has to be written explicitly.
    dataset_summary=(
        "Human lung scRNA-seq data from 3 datasets with 32,472 cells. "
        "From Vieira Braga et al. Technologies: 10X and Drop-seq."
    ),
    image="openproblems",
)
def lung_batch(test: bool = False, min_celltype_count: Optional[int] = None):
    """Load the lung dataset prepared for the batch integration tasks.

    Parameters
    ----------
    test
        Forwarded to ``load_lung``; selects the subsampled test dataset.
    min_celltype_count
        Forwarded to ``filter_celltypes``.

    Returns
    -------
    The AnnData with ``uns["organism"]``, ``obs["labels"]``,
    ``obsm["X_uni_pca"]`` and a neighbors graph stored under the key
    ``"uni"``; ``X`` holds the ``"log_normalized"`` layer.
    """
    import scanpy as sc

    adata = load_lung(test)
    adata.uns["organism"] = "human"
    adata.obs["labels"] = adata.obs["cell_type"]
    # No need to rename batch column as it already exists

    adata = filter_celltypes(adata, min_celltype_count=min_celltype_count)

    # Gene filtering runs while X is still the matrix returned by the loader,
    # i.e. before X is swapped to the log-normalized layer below.
    sc.pp.filter_genes(adata, min_counts=1)
    sc.pp.filter_genes(adata, min_cells=1)

    adata.X = adata.layers["log_normalized"]

    sc.tl.pca(
        adata,
        svd_solver="arpack",
        return_info=True,
    )
    # Keep a copy of the PCA embedding under a separate key.
    adata.obsm["X_uni_pca"] = adata.obsm["X_pca"]

    sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni")

    adata.var_names_make_unique()
    return adata
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from ..._common.datasets.immune import immune_batch
2+
from ..._common.datasets.lung import lung_batch
23
from ..._common.datasets.pancreas import pancreas_batch
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from ..._common.datasets.immune import immune_batch
2+
from ..._common.datasets.lung import lung_batch
23
from ..._common.datasets.pancreas import pancreas_batch
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
from ..._common.datasets.immune import immune_batch
2+
from ..._common.datasets.lung import lung_batch
23
from ..._common.datasets.pancreas import pancreas_batch

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"numpy>=1.21,<1.24",
88
"scikit-learn>=1.0,<1.2",
99
"anndata==0.8.*",
10-
"scprep>=1.2.1",
10+
"scprep>=1.2.2",
1111
"scipy>=1.7,<1.10",
1212
"scanpy>=1.6",
1313
"louvain==0.8.*",

0 commit comments

Comments
 (0)