Skip to content

Commit b3456fd

Browse files
Add method and metric descriptions (#810)
* add method and metric summaries * Update auprc.py
1 parent 0f3736b commit b3456fd

92 files changed

Lines changed: 862 additions & 449 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

main.bib

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ @article{agrawal2021mde
2222
doi = {10.1561/2200000090},
2323
url = {https://doi.org/10.1561/2200000090},
2424
}
25+
@article{aliee2021autogenes,
26+
title = {{AutoGeneS}: Automatic gene selection using multi-objective optimization for {RNA}-seq deconvolution},
27+
author = {Hananeh Aliee and Fabian J. Theis},
28+
year = {2021},
29+
month = jul,
30+
journal = {Cell Systems},
31+
publisher = {Elsevier {BV}},
32+
volume = {12},
33+
number = {7},
34+
pages = {706--715.e4},
35+
doi = {10.1016/j.cels.2021.05.006},
36+
url = {https://doi.org/10.1016/j.cels.2021.05.006},
37+
}
2538
@article{andersson2020single,
2639
title = {Single-cell and spatial transcriptomics enables probabilistic inference of cell type topography},
2740
author = {Alma Andersson and Joseph Bergenstr{\aa}hle and Michaela Asp and Ludvig Bergenstr{\aa}hle and Aleksandra Jurek and Jos{\'{e}} Fern{\'{a}}ndez Navarro and Joakim Lundeberg},

openproblems/tasks/_batch_integration/_common/methods/baseline.py

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
from .....tools.decorators import method
1+
from .....tools.decorators import baseline_method
22
from .....tools.utils import check_version
33

4-
import functools
54
import numpy as np
65

76

@@ -47,18 +46,12 @@ def _random_embedding(partition, jitter=0.01):
4746
return embedding
4847

4948

50-
_baseline_method = functools.partial(
51-
method,
52-
paper_name="Open Problems for Single Cell Analysis",
53-
paper_reference="openproblems",
54-
paper_year=2022,
55-
code_url="https://github.com/openproblems-bio/openproblems",
56-
is_baseline=True,
57-
)
58-
59-
60-
@_baseline_method(
49+
@baseline_method(
6150
method_name="No Integration",
51+
method_summary=(
52+
"Cells are embedded by PCA on the unintegrated data. A graph is built on this"
53+
" PCA embedding."
54+
),
6255
)
6356
def no_integration(adata, test=False):
6457
adata.obsp["connectivities"] = adata.obsp["uni_connectivities"]
@@ -69,8 +62,12 @@ def no_integration(adata, test=False):
6962
return adata
7063

7164

72-
@_baseline_method(
65+
@baseline_method(
7366
method_name="Random Integration",
67+
method_summary=(
68+
"Feature values, embedding coordinates, and graph connectivity are all randomly"
69+
" permuted"
70+
),
7471
)
7572
def random_integration(adata, test=False):
7673
adata.X = _randomize_features(adata.X)
@@ -80,13 +77,12 @@ def random_integration(adata, test=False):
8077
return adata
8178

8279

83-
@_baseline_method(
80+
@baseline_method(
8481
method_name="Random Integration by Celltype",
85-
paper_name="Random Integration by Celltype (baseline)",
86-
paper_reference="openproblems",
87-
paper_year=2022,
88-
code_url="https://github.com/openproblems-bio/openproblems",
89-
is_baseline=True,
82+
method_summary=(
83+
"Feature values, embedding coordinates, and graph connectivity are all randomly"
84+
" permuted within each celltype label"
85+
),
9086
)
9187
def celltype_random_integration(adata, test=False):
9288
adata.obsm["X_emb"] = _randomize_features(
@@ -101,8 +97,12 @@ def celltype_random_integration(adata, test=False):
10197
return adata
10298

10399

104-
@_baseline_method(
100+
@baseline_method(
105101
method_name="Random Integration by Batch",
102+
method_summary=(
103+
"Feature values, embedding coordinates, and graph connectivity are all randomly"
104+
" permuted within each batch label"
105+
),
106106
)
107107
def batch_random_integration(adata, test=False):
108108
adata.obsm["X_emb"] = _randomize_features(

openproblems/tasks/_batch_integration/batch_integration_embed/README.md

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -20,37 +20,6 @@ This sub-task was taken from a
2020
[benchmarking study of data integration
2121
methods](https://openproblems.bio/bibliography#luecken2022benchmarking).
2222

23-
## The metrics
24-
25-
Metrics for batch integration (embed) measure how well batches are mixed while
26-
biological signals are preserved. They are divided into batch correction and biological
27-
variance conservation metrics.
28-
29-
### Batch correction
30-
31-
* **kBET**: kBET determines whether the label composition of a k nearest neighborhood of
32-
a cell is similar to the expected (global) label composition
33-
([Buettner et al., Nat Meth 2019](https://openproblems.bio/bibliography#bttner2018test)).
34-
The test is repeated for a random subset of cells,
35-
and the results are summarized as a rejection rate over all tested neighborhoods.
36-
* **Silhouette batch score**: The absolute silhouette width is computed over batch
37-
labels per cell. As 0 then indicates that batches are well mixed and any deviation from
38-
0 indicates a batch effect, we use the 1-abs(ASW) to map the score to the scale [0;1].
39-
* **Principal component regression (PC regression)**: This compares the explained
40-
variance by batch before and after integration. It returns a score between 0 and 1
41-
(scaled=True) with 0 if the variance contribution hasn’t changed. The larger the score,
42-
the more different the variance contributions are before and after integration.
43-
44-
### Biological variance conservation
45-
46-
* **Cell cycle score**: The cell-cycle conservation score evaluates how well the
47-
cell-cycle effect can be captured before and after integration.
48-
* **Isolated label silhouette**: This score evaluates the compactness for the label(s)
49-
that is(are) shared by fewest batches. It indicates how well rare cell types can be
50-
preserved after integration.
51-
* **Cell type ASW**: The absolute silhouette width is computed on cell identity labels,
52-
measuring their compactness.
53-
5423
## API
5524

5625
WARNING: unlike most tasks, `adata.X` should contain log-normalized data.

openproblems/tasks/_batch_integration/batch_integration_embed/methods/baseline.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,37 @@
1-
from .....tools.decorators import method
1+
from .....tools.decorators import baseline_method
22
from .....tools.utils import check_version
33
from ..._common.methods.baseline import _random_embedding
44

5-
import functools
65
import numpy as np
76
import scanpy as sc
87

9-
_baseline_method = functools.partial(
10-
method,
11-
paper_name="Open Problems for Single Cell Analysis",
12-
paper_reference="openproblems",
13-
paper_year=2022,
14-
code_url="https://github.com/openproblems-bio/openproblems",
15-
is_baseline=True,
16-
)
17-
188

19-
@_baseline_method(
9+
@baseline_method(
2010
method_name="Random Embedding by Celltype (with jitter)",
11+
method_summary=(
12+
"Cells are embedded as a one-hot encoding of celltype labels, with a small"
13+
" amount of random noise added to the embedding"
14+
),
2115
)
2216
def celltype_random_embedding_jitter(adata, test=False):
2317
adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"], jitter=0.01)
2418
adata.uns["method_code_version"] = check_version("openproblems")
2519
return adata
2620

2721

28-
@_baseline_method(
22+
@baseline_method(
2923
method_name="Random Embedding by Celltype",
24+
method_summary="Cells are embedded as a one-hot encoding of celltype labels",
3025
)
3126
def celltype_random_embedding(adata, test=False):
3227
adata.obsm["X_emb"] = _random_embedding(partition=adata.obs["labels"], jitter=None)
3328
adata.uns["method_code_version"] = check_version("openproblems")
3429
return adata
3530

3631

37-
@_baseline_method(
32+
@baseline_method(
3833
method_name="No Integration by Batch",
34+
method_summary="Cells are embedded by computing PCA independently on each batch",
3935
)
4036
def no_integration_batch(adata, test=False):
4137
"""Compute PCA independently on each batch

openproblems/tasks/_batch_integration/batch_integration_embed/metrics/cc_score.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@
2121

2222
@metric(
2323
metric_name="Cell Cycle Score",
24+
metric_summary=(
25+
"The cell-cycle conservation score evaluates how well the cell-cycle effect can"
26+
" be captured before and after integration."
27+
),
2428
paper_reference="luecken2022benchmarking",
2529
maximize=True,
2630
image="openproblems-r-pytorch",

openproblems/tasks/_batch_integration/batch_integration_embed/metrics/iso_label_sil.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,11 @@
1414

1515
@metric(
1616
metric_name="Isolated label Silhouette",
17+
metric_summary=(
18+
"This score evaluates the compactness for the label(s) that is(are) shared by"
19+
" fewest batches. It indicates how well rare cell types can be preserved after"
20+
" integration."
21+
),
1722
paper_reference="luecken2022benchmarking",
1823
maximize=True,
1924
image="openproblems-r-pytorch",

openproblems/tasks/_batch_integration/batch_integration_embed/metrics/kBET.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@
2626

2727
@metric(
2828
metric_name="kBET",
29+
metric_summary=(
30+
"kBET determines whether the label composition of a k nearest neighborhood of a"
31+
" cell is similar to the expected (global) label composition. The test is"
32+
" repeated for a random subset of cells, and the results are summarized as a"
33+
" rejection rate over all tested neighborhoods."
34+
),
2935
paper_reference="bttner2018test",
3036
maximize=True,
3137
image="openproblems-r-extras",

openproblems/tasks/_batch_integration/batch_integration_embed/metrics/pcr.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818

1919
@metric(
2020
metric_name="PC Regression",
21+
metric_summary=(
22+
"This compares the explained variance by batch before and after integration. It"
23+
" returns a score between 0 and 1 (scaled=True) with 0 if the variance"
24+
" contribution hasn’t changed. The larger the score, the more different the"
25+
" variance contributions are before and after integration."
26+
),
2127
paper_reference="luecken2022benchmarking",
2228
maximize=True,
2329
image="openproblems-r-pytorch",

openproblems/tasks/_batch_integration/batch_integration_embed/metrics/sil_batch.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323

2424
@metric(
2525
metric_name="Batch ASW",
26+
metric_summary=(
27+
"The absolute silhouette width is computed over batch labels per cell. As 0"
28+
" then indicates that batches are well mixed and any deviation from 0 indicates"
29+
" a batch effect, we use the 1-abs(ASW) to map the score to the scale [0;1]."
30+
),
2631
paper_reference="luecken2022benchmarking",
2732
maximize=True,
2833
image="openproblems-r-pytorch",

openproblems/tasks/_batch_integration/batch_integration_embed/metrics/silhouette.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@
1111

1212
@metric(
1313
metric_name="Silhouette",
14+
metric_summary=(
15+
"The absolute silhouette width is computed on cell identity labels, measuring"
16+
" their compactness."
17+
),
1418
paper_reference="luecken2022benchmarking",
1519
maximize=True,
1620
image="openproblems-r-pytorch",

0 commit comments

Comments
 (0)