
Commit afafab4

Authored by danielStrobl, github-actions[bot], scottgigante-immunai, and LuckyMD
Batch integration data (#355)
* initial commit datasets batch integration
* shorten long line
* pre-commit
* keep raw counts in X
* kill pytest after 2 fails for testing
* increase swap size
* set swap size
* swap
* fix syntax
* change order of tests
* remove duplicate layer
* pre-commit
* immune cell dataloader comments
* doc
* add task dataloaders and subsampling immune
* pre-commit
* add batch integration to init py
* pre-commit
* typo
* generate empty structure for metrics/methods
* init py root
* metrics wrong folder
* fix pancreas dataloader batch
* pancreas batch column
* method stub
* stub metric
* pre-commit
* import error
* one method
* method error
* pre-commit
* remove unused
* pre-commit
* change placeholder method to combat
* pre-commit
* downstream pp
* reduce data correct import
* pre-commit
* grammar
* removed random and
* addressing comments

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Co-authored-by: MalteDLuecken <m.d.luecken@gmail.com>
1 parent 84c8287 commit afafab4

16 files changed

Lines changed: 271 additions & 3 deletions


.github/workflows/run_tests.yml

Lines changed: 1 addition & 1 deletion
@@ -180,7 +180,7 @@ jobs:
           cd ..
 
       - name: Run tests
-        run: pytest --cov=openproblems --cov-report=term-missing:skip-covered --cov-report=xml -vv
+        run: pytest --cov=openproblems --cov-report=term-missing:skip-covered --cov-report=xml -vv --maxfail=2
 
       - name: Upload coverage
         continue-on-error: ${{ github.repository != 'openproblems-bio/openproblems' }}

openproblems/data/immune_cells.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
+from . import utils
+
+import os
+import scanpy as sc
+import scprep
+import tempfile
+
+URL = "https://ndownloader.figshare.com/files/25717328"
+
+
+@utils.loader
+def load_immune(test=False):
+    """Download immune human data from figshare."""
+    if test:
+        # load full data first, cached if available
+        adata = load_immune(test=False)
+
+        # Subsample immune data to two batches with 250 cells each
+        adata = adata[:, :500].copy()
+        batch1 = adata[adata.obs.batch == "Oetjen_A"][:250]
+        batch2 = adata[adata.obs.batch == "Freytag"][:250]
+        adata = batch1.concatenate(batch2)
+        # Note: could also use 200-500 HVGs rather than 200 random genes
+
+        # Ensure there are no cells or genes with 0 counts
+        utils.filter_genes_cells(adata)
+
+        return adata
+
+    else:
+        with tempfile.TemporaryDirectory() as tempdir:
+            filepath = os.path.join(tempdir, "immune.h5ad")
+            scprep.io.download.download_url(URL, filepath)
+            adata = sc.read(filepath)
+
+        # Note: anndata.X contains scran log-normalized data,
+        # so we're storing it in layers['log_scran']
+        adata.layers["log_scran"] = adata.X
+        adata.X = adata.layers["counts"]
+        del adata.layers["counts"]
+
+        # Ensure there are no cells or genes with 0 counts
+        utils.filter_genes_cells(adata)
+
+        return adata

openproblems/data/pancreas.py

Lines changed: 2 additions & 2 deletions
@@ -39,10 +39,10 @@ def load_pancreas(test=False):
             scprep.io.download.download_url(URL, filepath)
             adata = sc.read(filepath)
 
-        # Remove preprocessing
+        # NOTE: X contains counts that are normalized with scran
+        adata.layers["log_scran"] = adata.X
         adata.X = adata.layers["counts"]
         del adata.layers["counts"]
-
         # Ensure there are no cells or genes with 0 counts
         utils.filter_genes_cells(adata)

openproblems/tasks/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -3,3 +3,4 @@
 from . import label_projection
 from . import multimodal_data_integration
 from . import regulatory_effect_prediction
+from ._batch_integration import batch_integration_graph
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Batch integration
+
+Batch (or data) integration methods integrate datasets across batches that arise from various biological (e.g., tissue, location, individual, species) and technical (e.g., ambient RNA, lab, protocol) sources. The goal of a batch integration method is to remove unwanted batch effects in the data, while retaining biologically meaningful variation that can help us to detect cell identities, fit cellular trajectories, or understand patterns of gene or pathway activity.
+
+Methods that integrate batches typically produce one or more of three output types: a corrected feature matrix, a joint embedding across batches, and/or an integrated cell-cell similarity graph (e.g., a kNN graph). In order to define a consistent input and output for each method and metric, we have divided the batch integration task into three subtasks:
+
+* [Batch integration graphs](batch_integration_graph/),
+* [Batch integration embeddings](batch_integration_embed/), and
+* [Batch integrated feature matrices]()
+
+These subtasks collate methods that share an output type, together with the metrics that evaluate that output. As corrected feature matrices can be turned into embeddings, which in turn can be processed into integrated graphs, methods overlap between the subtasks. All methods are added to the graph subtask and imported into the other subtasks from there. Information on the task API for datasets, methods, and metrics can be found on the individual subtask pages.
+
+Metrics for this task can be divided into those that assess the removal of batch effects and those that assess the conservation of biological variation; this distinction can be helpful when devising new metrics. This task, including the subtask structure, was taken from a [benchmarking study of data integration methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2), which is a useful reference for more background on the task and the above concepts.
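The pipeline implied above — a feature matrix yields an embedding, and an embedding yields an integrated graph — can be illustrated with a toy kNN-graph builder. This is a hypothetical, pure-Python stand-in for what `scanpy.pp.neighbors()` does on real embeddings; all names here are illustrative only.

```python
import math

def knn_graph(embedding, k=2):
    """Toy kNN graph: for each point, keep the indices of its k nearest
    neighbours (a stand-in for scanpy.pp.neighbors on an embedding)."""
    graph = {}
    for i, point in enumerate(embedding):
        # Sort all other points by Euclidean distance to this one.
        others = sorted(
            (j for j in range(len(embedding)) if j != i),
            key=lambda j: math.dist(point, embedding[j]),
        )
        graph[i] = others[:k]
    return graph

# Two tight clusters: nearest neighbours stay within a cluster.
points = [(0.0, 0.0), (0.0, 1.0), (5.0, 5.0), (5.0, 6.0)]
graph = knn_graph(points, k=1)
```

A real implementation would return sparse connectivity and distance matrices rather than an adjacency dict, but the neighbour-selection idea is the same.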
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+from ... import utils
+
+# from . import datasets, methods, metrics, checks
+
+_task_name = "Batch integration"
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+<!--- TODO: add links --->
+
+# Batch integration graph
+
+This is a sub-task of the overall batch integration task. Batch (or data) integration methods integrate datasets across batches that arise from various biological and technical sources. Methods that integrate batches typically produce one or more of three output types: a corrected feature matrix, a joint embedding across batches, and/or an integrated cell-cell similarity graph (e.g., a kNN graph). This sub-task focuses on all methods that can output integrated graphs, and includes methods that canonically output the other two data formats, with subsequent postprocessing to generate a graph. Other sub-tasks for batch integration can be found for:
+
+* [embeddings](../batch_integration_embed/), and
+* [corrected features]()
+
+This sub-task was taken from a [benchmarking study of data integration methods](https://www.biorxiv.org/content/10.1101/2020.05.22.111161v2).
+
+## API
+
+Datasets should contain the following attributes:
+
+* `adata.obs["batch"]` with the batch covariate,
+* `adata.obs["label"]` with the cell identity label,
+* `adata.layers['counts']` with raw, integer UMI count data,
+* `adata.obsm['X_uni']` with the PCA embedding of the unintegrated representation,
+* `adata.obsp['uni_connectivities']` with an unintegrated connectivity matrix generated by `scanpy.pp.neighbors()`, and
+* `adata.X` with log-normalized data.
+
+Methods can take anything from datasets as input and should assign output to:
+
+* `adata.obsp['connectivities']` and `adata.obsp['distances']`, or
+* `adata.uns['neighbors']['connectivities']` and `adata.uns['neighbors']['distances']`.
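As a quick illustration of this API, the attribute layout a dataset provides and the slots a method fills can be mocked with a simple stand-in object. This is a hypothetical mock for exposition only, not a real `anndata.AnnData`:

```python
from types import SimpleNamespace

# Hypothetical mock mirroring only the attribute layout listed above.
adata = SimpleNamespace(
    obs={"batch": ["batch1", "batch1", "batch2"], "label": ["T", "B", "T"]},
    obsm={"X_uni": [[0.0, 0.1], [0.2, 0.3], [0.4, 0.5]]},
    obsp={"uni_connectivities": "placeholder"},
    layers={"counts": "placeholder"},
    X="placeholder",
)

# A method writes its integrated graph into the first accepted location:
adata.obsp["connectivities"] = "placeholder"
adata.obsp["distances"] = "placeholder"

def fits_method_api(adata):
    """Sketch of the check a metric could rely on (obsp variant only)."""
    return "connectivities" in adata.obsp and "distances" in adata.obsp
```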
+
+Please note that most methods do not use cell type labels, which improves their usability.
+
+The `openproblems-python-batch-integration` docker container is used for the methods that can be installed without package conflicts. (NOTE: add additional containers here) For R methods, the `openproblems-r-extras` container is used.
+
+Methods are run in four different scenarios that include scaling and highly variable gene selection:
+
+* `full_unscaled`
+* `hvg_unscaled`
+* `full_scaled`
+* `hvg_scaled`
+
+Functions for scaling and highly variable gene selection per batch are reused from [`scib`](https://github.com/theislab/scib). Additionally, method wrappers are reused from `scib` where possible.
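The four scenarios are simply the cross-product of gene selection (all genes vs. highly variable genes) and scaling; a minimal sketch:

```python
from itertools import product

# full = all genes, hvg = highly variable genes only;
# each is run with and without per-batch scaling.
scenarios = [
    f"{genes}_{scaling}"
    for genes, scaling in product(("full", "hvg"), ("unscaled", "scaled"))
]
```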
43+
44+
Metrics can compare:
45+
* `adata.obsp['connectivities']` to `adata.obs['uni_connectivies']`,
46+
* `adata.obsp['connectivities']` to `adata.obs['label']`, and/or
47+
* `adata.obsp['connectivities']` to `adata.obs['batch']`.
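To make the graph-to-batch comparison concrete, here is a toy sketch of one way a metric could score an integrated graph against the batch covariate: the fraction of each cell's neighbours drawn from a different batch (higher means better mixing). This is an illustrative assumption, not one of the task's actual metrics:

```python
def batch_mixing(neighbors, batches):
    """Toy batch-mixing score (NOT an openproblems metric).

    neighbors[i] -- list of neighbour indices of cell i in the graph
    batches[i]   -- batch label of cell i
    """
    per_cell = [
        sum(batches[j] != batches[i] for j in nbrs) / len(nbrs)
        for i, nbrs in enumerate(neighbors)
    ]
    # Average over cells: 0 = no mixing, 1 = all neighbours cross-batch.
    return sum(per_cell) / len(per_cell)

# Three cells, two batches; each cell's graph neighbours:
score = batch_mixing([[1, 2], [0, 2], [0, 1]], ["a", "a", "b"])
```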
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+from .... import utils
+from . import api
+from . import datasets
+from . import methods
+from . import metrics
+
+_task_name = "Batch integration graph"
+
+DATASETS = utils.get_callable_members(datasets)
+METHODS = utils.get_callable_members(methods)
+METRICS = utils.get_callable_members(metrics)
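`utils.get_callable_members` itself is not part of this diff; a plausible sketch of what such a helper does (an assumption about its behaviour, not the actual openproblems implementation) is:

```python
import types

def get_callable_members(module):
    """Collect a module's public callables (hypothetical sketch)."""
    return [
        obj
        for name, obj in vars(module).items()
        if callable(obj) and not name.startswith("_")
    ]

# Usage on a throwaway module:
mod = types.ModuleType("demo")
mod.my_metric = lambda adata: 0.5  # public -> collected
mod._helper = lambda: None         # private -> skipped
members = get_callable_members(mod)
```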
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+from .datasets.immune import immune_batch
+
+
+def check_dataset(adata):
+    """Check that dataset output fits expected API."""
+
+    assert "X_uni" in adata.obsm
+    assert "batch" in adata.obs
+    assert "labels" in adata.obs
+    assert "uni_connectivities" in adata.obsp
+
+    return True
+
+
+def check_method(adata):
+    """Check that method output fits expected API."""
+    assert "connectivities" in adata.obsp
+    assert "distances" in adata.obsp
+    return True
+
+
+def sample_dataset():
+    """Create a simple dataset to use for testing methods in this task."""
+    adata = immune_batch(True)
+    # print(adata.obs.columns)
+
+    return adata
+
+
+def sample_method(adata):
+    """Create sample method output for testing metrics in this task."""
+    import scanpy as sc
+
+    sc.pp.neighbors(adata, use_rep="X_uni")
+    return adata
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+from .immune import immune_batch
+from .pancreas import pancreas_batch
