
Commit 3d3eb22

Authored by danielStrobl, github-actions[bot], and scottgigante-immunai
Use graph and embedding metrics for feature and embedding subtask (#807)
* wrappers for output generation
* pre-commit
* add pca to sample feature task dataset
* pre-commit
* Update api.py
* bugfixes
* pre-commit
* flake8 import
* pre-commit
* test other syntax
* pre-commit
* disable flake8 for long import
* pre-commit
* added whitespace
* pre-commit
* Address flake8
* pre-commit
* address flake8
* pre-commit
* flake8
* Fix syntax
* pre-commit
* pre-commit
* graph conn flake8
* pre-commit
* clean up gitignore
* refactor for readability
* require uncorrected PCA for feature task
* pre-commit

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Scott Gigante <84813314+scottgigante-immunai@users.noreply.github.com>
Co-authored-by: Scott Gigante <scott.gigante@immunai.com>
Former-commit-id: fe18dfb
Parent: 678b400 · Commit: 3d3eb22

20 files changed: 464 additions & 7 deletions

.gitignore
Lines changed: 2 additions & 4 deletions

@@ -146,14 +146,12 @@ nf-openproblems

 # Editor
 .idea
+.vscode

 scratch/
 openproblems/results/
 openproblems/work/
 batch_embed.txt
-immune.h5ad
+*.h5ad

-immune.h5ad
-batch_embed.txt
-.vscode/launch.json
 run_bbknn.py
Lines changed: 4 additions & 0 deletions

@@ -1,6 +1,10 @@
+from .ari import ari
 from .cc_score import cc_score
+from .graph_connectivity import graph_connectivity
+from .iso_label_f1 import isolated_labels_f1
 from .iso_label_sil import isolated_labels_sil
 from .kBET import kBET
+from .nmi import nmi
 from .pcr import pcr
 from .sil_batch import silhouette_batch
 from .silhouette import silhouette
Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+from .....tools.decorators import metric
+from ...batch_integration_graph import metrics as graph_metrics
+
+"""
+The Rand index compares the overlap of two clusterings;
+it considers both correct clustering overlaps while also counting correct
+disagreements between two clusterings.
+Similar to NMI, we compared the cell-type labels with the NMI-optimized
+Louvain clustering computed on the integrated dataset.
+The adjustment of the Rand index corrects for randomly correct labels.
+An ARI of 0 or 1 corresponds to random labeling or a perfect match,
+respectively.
+We also used the scikit-learn (v.0.22.1) implementation of the ARI.
+"""
+
+
+@metric(
+    metric_name="ARI",
+    maximize=True,
+    paper_reference="luecken2022benchmarking",
+    image="openproblems-r-pytorch",
+)
+def ari(adata):
+    from scanpy.pp import neighbors
+
+    neighbors(adata, use_rep="X_emb")
+    return graph_metrics.ari(adata)
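As a rough illustration of what the wrapped metric computes, here is a minimal sketch of the adjusted Rand index using scikit-learn (the `graph_metrics.ari` helper above additionally performs NMI-optimized Louvain clustering on the kNN graph; the labels below are toy data):

```python
from sklearn.metrics import adjusted_rand_score

# Toy cell-type labels vs. a clustering of the same cells.
labels = [0, 0, 1, 1, 2, 2]
clusters = [1, 1, 0, 0, 2, 2]  # same partition, different cluster ids

# ARI is permutation-invariant: identical partitions score 1.0,
# and random labelings score close to 0 after the chance adjustment.
print(adjusted_rand_score(labels, clusters))  # 1.0
```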
Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+from .....tools.decorators import metric
+from ...batch_integration_graph import metrics as graph_metrics
+
+"""
+The graph connectivity metric assesses whether the kNN graph representation,
+G, of the integrated data directly connects all cells with the same cell
+identity label. For each cell identity label c, we created the subset kNN
+graph G(Nc;Ec) to contain only cells from a given label. Using these subset
+kNN graphs, we computed the graph connectivity score using the equation:
+
+gc = 1/|C| Σ_{c∈C} |LCC(G(Nc;Ec))| / |Nc|
+
+Here, C represents the set of cell identity labels, |LCC()| is the number
+of nodes in the largest connected component of the graph, and |Nc| is the
+number of nodes with cell identity c. The resultant score has a range
+of (0;1], where 1 indicates that all cells with the same cell identity
+are connected in the integrated kNN graph, and the lowest possible score
+indicates a graph where no cell is connected. As this score is computed
+on the kNN graph, it can be used to evaluate all integration outputs.
+"""
+
+
+@metric(
+    metric_name="Graph connectivity",
+    paper_reference="luecken2022benchmarking",
+    maximize=True,
+    image="openproblems-r-pytorch",
+)
+def graph_connectivity(adata):
+    from scanpy.pp import neighbors
+
+    neighbors(adata, use_rep="X_emb")
+    return graph_metrics.graph_connectivity(adata)
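The equation in the docstring can be sketched directly with scipy. This is a toy, self-contained illustration of the formula, not the implementation the wrapped `graph_metrics.graph_connectivity` call delegates to:

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import connected_components

def graph_connectivity_score(knn, labels):
    """gc = mean over labels c of |LCC(subgraph of label-c cells)| / |N_c|."""
    labels = np.asarray(labels)
    scores = []
    for c in np.unique(labels):
        mask = labels == c
        sub = knn[mask][:, mask]  # subset kNN graph G(Nc;Ec)
        _, comp = connected_components(csr_matrix(sub), directed=False)
        # Fraction of label-c cells inside the largest connected component.
        scores.append(np.bincount(comp).max() / mask.sum())
    return float(np.mean(scores))

# Toy adjacency: label 0 (cells 0,1) fully connected;
# label 1 (cells 2,3) has no edges, so its LCC covers half its cells.
knn = np.array([
    [0, 1, 0, 0],
    [1, 0, 0, 0],
    [0, 0, 0, 0],
    [0, 0, 0, 0],
])
print(graph_connectivity_score(knn, [0, 0, 1, 1]))  # (1.0 + 0.5) / 2 = 0.75
```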
Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+from .....tools.decorators import metric
+from ...batch_integration_graph import metrics as graph_metrics
+
+"""
+We developed two isolated label scores to evaluate how well the data integration methods
+dealt with cell identity labels shared by few batches. Specifically, we identified
+isolated cell labels as the labels present in the least number of batches in the
+integration task.
+The score evaluates how well these isolated labels separate from other cell identities.
+We implemented the isolated label metric in two versions:
+(1) the best clustering of the isolated label (F1 score) and
+(2) the global ASW of the isolated label. For the cluster-based score,
+we first optimize the cluster assignment of the isolated label using the F1 score
+across louvain clustering resolutions ranging from 0.1 to 2 in resolution steps of 0.1.
+The optimal F1 score for the isolated label is then used as the metric score.
+The F1 score is a weighted mean of precision and recall given by the equation:
+F1 = 2 × (precision × recall) / (precision + recall).
+
+It returns a value between 0 and 1,
+where 1 shows that all of the isolated label cells and no others are captured in
+the cluster. For the isolated label ASW score, we compute the ASW of isolated
+versus nonisolated labels on the PCA embedding (ASW metric above) and scale this
+score to be between 0 and 1. The final score for each metric version consists of
+the mean isolated score of all isolated labels.
+"""
+
+
+@metric(
+    metric_name="Isolated label F1",
+    paper_reference="luecken2022benchmarking",
+    maximize=True,
+    image="openproblems-r-pytorch",
+)
+def isolated_labels_f1(adata):
+    from scanpy.pp import neighbors
+
+    neighbors(adata, use_rep="X_emb")
+    return graph_metrics.isolated_labels_f1(adata)
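The cluster-optimization step in the docstring can be sketched for a single clustering: treat each cluster as a binary predictor of the isolated label and keep the best F1. This toy stand-in omits the Louvain resolution sweep the repository's metric performs:

```python
from sklearn.metrics import f1_score

def isolated_label_f1(labels, clusters, isolated):
    """Best F1 over clusters, scoring each cluster as a detector of `isolated`."""
    truth = [int(lab == isolated) for lab in labels]
    best = 0.0
    for c in set(clusters):
        pred = [int(k == c) for k in clusters]
        best = max(best, f1_score(truth, pred, zero_division=0))
    return best

labels = ["iso", "iso", "a", "a", "b"]
clusters = [0, 0, 1, 1, 2]
# Cluster 0 contains exactly the "iso" cells, so the best F1 is perfect.
print(isolated_label_f1(labels, clusters, "iso"))  # 1.0
```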
Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+from .....tools.decorators import metric
+from ...batch_integration_graph import metrics as graph_metrics
+
+"""NMI compares the overlap of two clusterings.
+We used NMI to compare the cell-type labels with Louvain clusters computed on
+the integrated dataset. The overlap was scaled using the mean of the entropy terms
+for cell-type and cluster labels. Thus, NMI scores of 0 or 1 correspond to uncorrelated
+clustering or a perfect match, respectively. We performed optimized Louvain clustering
+for this metric to obtain the best match between clusters and labels.
+Louvain clustering was performed at a resolution range of 0.1 to 2 in steps of 0.1,
+and the clustering output with the highest NMI with the label set was used. We used
+the scikit-learn (v.0.22.1) implementation of NMI.
+"""
+
+
+@metric(
+    metric_name="NMI",
+    paper_reference="luecken2022benchmarking",
+    maximize=True,
+    image="openproblems-r-pytorch",
+)
+def nmi(adata):
+    from scanpy.pp import neighbors
+
+    neighbors(adata, use_rep="X_emb")
+    return graph_metrics.nmi(adata)
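A minimal sketch of the underlying score, using the scikit-learn implementation the docstring references (the resolution sweep over Louvain clusterings is handled by the wrapped `graph_metrics.nmi`):

```python
from sklearn.metrics import normalized_mutual_info_score

labels = [0, 0, 1, 1]
clusters = [1, 1, 0, 0]  # same partition, relabeled

# Arithmetic-mean normalization scales the overlap by the mean of the
# entropy terms for the two labelings, as described in the docstring.
print(normalized_mutual_info_score(labels, clusters, average_method="arithmetic"))  # 1.0
```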

openproblems/tasks/_batch_integration/batch_integration_feature/README.md
Lines changed: 1 addition & 0 deletions

@@ -42,6 +42,7 @@ Datasets should contain the following attributes:

 * `adata.obs["batch"]` with the batch covariate, and
 * `adata.obs["label"]` with the cell identity label
+* `adata.obs["X_uni_pca"]` with a PCA embedding of the uncorrected data
 * `adata.layers['counts']` with raw, integer UMI count data,
 * `adata.layers['log_normalized']` with log-normalized data and
 * `adata.X` with log-normalized data

openproblems/tasks/_batch_integration/batch_integration_feature/api.py
Lines changed: 7 additions & 2 deletions

@@ -1,6 +1,9 @@
+from ....tools.decorators import dataset
 from .._common import api

-check_dataset = api.check_dataset
+import functools
+
+check_dataset = functools.partial(api.check_dataset, do_check_pca=True)


 def check_method(adata, is_baseline=False):
@@ -11,7 +14,9 @@ def check_method(adata, is_baseline=False):
     return True


-sample_dataset = api.sample_dataset
+@dataset()
+def sample_dataset():
+    return api.sample_dataset(run_pca=True)


 def sample_method(adata):
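The `functools.partial` change above pins an extra keyword argument onto the shared checker while keeping its call signature for callers. A small sketch of the pattern, with a hypothetical stand-in for `api.check_dataset` (the real function lives in the task's `_common/api.py`):

```python
import functools

# Hypothetical stand-in for the shared api.check_dataset.
def check_dataset(adata, do_check_pca=False):
    """Pretend dataset check: require the uncorrected-PCA key when asked."""
    if do_check_pca:
        return "X_uni_pca" in adata
    return True

# The feature subtask pins do_check_pca=True; callers still pass only `adata`.
check_dataset_feature = functools.partial(check_dataset, do_check_pca=True)

print(check_dataset_feature({"X_uni_pca": None}))  # True
print(check_dataset_feature({}))  # False
```

The same effect could be had with a wrapper `def`, but `partial` keeps the binding explicit and one line long.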
Lines changed: 10 additions & 0 deletions

@@ -1 +1,11 @@
+from .ari import ari
+from .cc_score import cc_score
+from .graph_connectivity import graph_connectivity
 from .hvg_conservation import hvg_conservation
+from .iso_label_f1 import isolated_labels_f1
+from .iso_label_sil import isolated_labels_sil
+from .kBET import kBET
+from .nmi import nmi
+from .pcr import pcr
+from .sil_batch import silhouette_batch
+from .silhouette import silhouette
Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+from .....tools.decorators import metric
+from ...batch_integration_graph import metrics as graph_metrics
+
+"""
+The Rand index compares the overlap of two clusterings;
+it considers both correct clustering overlaps while also counting correct
+disagreements between two clusterings.
+Similar to NMI, we compared the cell-type labels with the NMI-optimized
+Louvain clustering computed on the integrated dataset.
+The adjustment of the Rand index corrects for randomly correct labels.
+An ARI of 0 or 1 corresponds to random labeling or a perfect match,
+respectively.
+We also used the scikit-learn (v.0.22.1) implementation of the ARI.
+"""
+
+
+@metric(
+    metric_name="ARI",
+    maximize=True,
+    paper_reference="luecken2022benchmarking",
+    image="openproblems-r-pytorch",
+)
+def ari(adata):
+    from scanpy.pp import neighbors
+    from scanpy.tl import pca
+
+    adata.obsm["X_emb"] = pca(adata.X)
+    neighbors(adata, use_rep="X_emb")
+    return graph_metrics.ari(adata)
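The feature-task variant differs from the embedding-task one in a single step: it first reduces the feature matrix with PCA and stores the result as the embedding on which the kNN graph is built. A toy sketch of that step using scikit-learn in place of the commit's `scanpy.tl.pca`:

```python
import numpy as np
from sklearn.decomposition import PCA

# Toy feature matrix: 20 "cells" x 10 "genes".
rng = np.random.default_rng(0)
X = rng.normal(size=(20, 10))

# Reduce features to a low-dimensional embedding, mirroring
# `adata.obsm["X_emb"] = pca(adata.X)` in the diff above.
X_emb = PCA(n_components=5).fit_transform(X)
print(X_emb.shape)  # (20, 5)
```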
