add new dataloaders (#792)

danielStrobl · scottgigante-immunai · github-actions[bot] · web-flow · commit 091bfb86363c · 2023-02-02T08:14:21.000+11:00
* add new dataloaders * task dataloaders * address comments * correct docstring * change dataset name * remove whitespace * add to init * pre-commit * filter celltypes * pre-commit * filter celltypes * pre-commit * update urllib requirement * pre-commit * urllib * Use scprep 1.2.2 * remove immune human mouse for now * Remove urllib dep * Remove broken import --------- Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Former-commit-id: c366d94
diff --git a/openproblems/data/lung.py b/openproblems/data/lung.py
@@ -0,0 +1,45 @@
+from . import utils
+
+import os
+import scprep
+import tempfile
+
+# sparsified from https://figshare.com/articles/dataset/Benchmarking_atlas-level_data_integration_in_single-cell_genomics_-_integration_task_datasets_Immune_and_pancreas_/12420968/2 # noqa: E501
+URL = "https://figshare.com/ndownloader/files/24539942"
+
+
+@utils.loader(data_url=URL, data_reference="luecken2022benchmarking")
+def load_lung(test=False):
+    """Download the lung atlas from figshare and return it as an AnnData object.
+
+    With ``test=False`` the h5ad file at ``URL`` is downloaded into a temporary
+    directory and read with scanpy. The matrix originally stored in ``X`` is
+    kept as ``layers["log_normalized"]`` and ``layers["counts"]`` becomes ``X``.
+
+    With ``test=True`` the full dataset is loaded first and then reduced to the
+    first 500 genes and at most 250 cells from each of batches "4" and "A6".
+    """
+    import scanpy as sc
+
+    if not test:
+        with tempfile.TemporaryDirectory() as download_dir:
+            h5ad_path = os.path.join(download_dir, "Lung_atlas_public.h5ad")
+            scprep.io.download.download_url(URL, h5ad_path)
+            adata = sc.read(h5ad_path)
+
+            # NOTE: adata.X contains log-normalized data, so park it in a layer
+            # and expose the counts layer as X instead.
+            adata.layers["log_normalized"] = adata.X
+            adata.X = adata.layers["counts"]
+
+            # Ensure there are no cells or genes with 0 counts
+            utils.filter_genes_cells(adata)
+
+        return adata
+
+    # Test mode: start from the full data (cached if available) and keep only
+    # the first 500 genes.
+    # Note: could also use 200-500 HVGs rather than the first 500 genes
+    subset = load_lung(test=False)[:, :500].copy()
+
+    # Subsample lung data to two batches with (up to) 250 cells each
+    per_batch = [subset[subset.obs.batch == name][:250] for name in ("4", "A6")]
+    adata = per_batch[0].concatenate(per_batch[1])
+
+    # Ensure there are no cells or genes with 0 counts
+    utils.filter_genes_cells(adata)
+
+    return adata
diff --git a/openproblems/tasks/_batch_integration/_common/datasets/__init__.py b/openproblems/tasks/_batch_integration/_common/datasets/__init__.py
@@ -1,2 +1,3 @@
 from .immune import immune_batch
+from .lung import lung_batch
 from .pancreas import pancreas_batch
diff --git a/openproblems/tasks/_batch_integration/_common/datasets/lung.py b/openproblems/tasks/_batch_integration/_common/datasets/lung.py
@@ -0,0 +1,40 @@
+from .....data.lung import load_lung
+from .....tools.decorators import dataset
+from ..utils import filter_celltypes
+from typing import Optional
+
+
+@dataset(
+    dataset_name="Lung (Vieira Braga et al.)",
+    data_url=load_lung.metadata["data_url"],
+    data_reference=load_lung.metadata["data_reference"],
+    dataset_summary="Human lung scRNA-seq data from 3 datasets with 32,472 cells. "
+    "From Vieira Braga et al. Technologies: 10X and Drop-seq.",
+    image="openproblems",
+)
+def lung_batch(test: bool = False, min_celltype_count: Optional[int] = None):
+    """Load the lung dataset and prepare it for the batch integration tasks.
+
+    Args:
+        test: Forwarded to ``load_lung``; selects its reduced test dataset.
+        min_celltype_count: Forwarded unchanged to ``filter_celltypes``.
+
+    Returns:
+        The AnnData object with ``uns["organism"]`` set to ``"human"``,
+        ``obs["labels"]`` copied from ``obs["cell_type"]``, ``X`` set to the
+        ``"log_normalized"`` layer, a PCA stored in ``obsm["X_uni_pca"]`` and a
+        neighbors graph stored under the key ``"uni"``.
+    """
+    import scanpy as sc
+
+    adata = load_lung(test)
+    adata.uns["organism"] = "human"
+    adata.obs["labels"] = adata.obs["cell_type"]
+    # No need to rename batch column as it already exists
+
+    adata = filter_celltypes(adata, min_celltype_count=min_celltype_count)
+
+    # Re-filter genes after the cell type filter: drop genes with no counts or
+    # no expressing cells in what remains.
+    sc.pp.filter_genes(adata, min_counts=1)
+    sc.pp.filter_genes(adata, min_cells=1)
+
+    # From here on X is the log-normalized layer, so the PCA below is computed
+    # on log-normalized values.
+    adata.X = adata.layers["log_normalized"]
+
+    sc.tl.pca(
+        adata,
+        svd_solver="arpack",
+        return_info=True,
+    )
+    # Keep a copy of the PCA under a separate key and build the "uni" neighbors
+    # graph from that copy.
+    adata.obsm["X_uni_pca"] = adata.obsm["X_pca"]
+
+    sc.pp.neighbors(adata, use_rep="X_uni_pca", key_added="uni")
+
+    adata.var_names_make_unique()
+    return adata
diff --git a/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_embed/datasets/__init__.py
@@ -1,2 +1,3 @@
 from ..._common.datasets.immune import immune_batch
+from ..._common.datasets.lung import lung_batch
 from ..._common.datasets.pancreas import pancreas_batch
diff --git a/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_feature/datasets/__init__.py
@@ -1,2 +1,3 @@
 from ..._common.datasets.immune import immune_batch
+from ..._common.datasets.lung import lung_batch
 from ..._common.datasets.pancreas import pancreas_batch
diff --git a/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py b/openproblems/tasks/_batch_integration/batch_integration_graph/datasets/__init__.py
@@ -1,2 +1,3 @@
 from ..._common.datasets.immune import immune_batch
+from ..._common.datasets.lung import lung_batch
 from ..._common.datasets.pancreas import pancreas_batch
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
     "numpy>=1.21,<1.24",
     "scikit-learn>=1.0,<1.2",
     "anndata==0.8.*",
-    "scprep>=1.2.1",
+    "scprep>=1.2.2",
     "scipy>=1.7,<1.10",
     "scanpy>=1.6",
     "louvain==0.8.*",

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`from .immune import immune_batch`
	`2`	`+from .lung import lung_batch`
`2`	`3`	`from .pancreas import pancreas_batch`
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`from ..._common.datasets.immune import immune_batch`
	`2`	`+from ..._common.datasets.lung import lung_batch`
`2`	`3`	`from ..._common.datasets.pancreas import pancreas_batch`