Skip to content

Commit fb6ecca

Browse files
committed
Cross validation by method refactor, statistical test moved in and old LTOO excluded for now
1 parent b850742 commit fb6ecca

1 file changed

Lines changed: 100 additions & 197 deletions

File tree

src/diffupath/cross_validation.py

Lines changed: 100 additions & 197 deletions
Original file line numberDiff line numberDiff line change
@@ -1,209 +1,29 @@
11
# -*- coding: utf-8 -*-
22

33
"""Cross-validation utilities."""
4-
4+
import itertools
5+
import math
56
from collections import defaultdict
7+
from typing import Union, Tuple
68

79
import numpy as np
810
from diffupy.diffuse_raw import diffuse_raw
11+
from diffupy.kernels import regularised_laplacian_kernel
912
from diffupy.matrix import Matrix
10-
from diffupy.process_input import generate_categoric_input_from_labels
13+
from diffupy.process_input import format_input_for_diffusion, process_input_data
14+
from pybel import get_subgraph_by_annotation_value
15+
from scipy import stats
1116
from sklearn import metrics
17+
from statsmodels.stats.multitest import fdrcorrection
1218
from tqdm import tqdm
1319

14-
from typing import Union, Tuple
15-
1620
from .topological_analyses import generate_pagerank_baseline
17-
from .utils import random_disjoint_intersection_three_subsets, hide_true_positives, split_random_two_subsets
18-
19-
"""Random cross validation_datasets"""
20-
21-
22-
def get_random_cv_split_input_and_validation(input: Union[list, set],
23-
background_mat: Matrix
24-
) -> Tuple[Matrix, Matrix]:
25-
"""Get random CV split."""
26-
input_labels, validation_labels = split_random_two_subsets(input)
27-
28-
return (
29-
generate_categoric_input_from_labels(
30-
input_labels,
31-
'label_input with hidden true positives',
32-
background_mat
33-
),
34-
generate_categoric_input_from_labels(
35-
validation_labels,
36-
'original label_input labels',
37-
background_mat
38-
)
39-
)
40-
41-
42-
def get_random_cv_inputs_from_subsets_same_diff_input(input_subsets: Union[list, set],
43-
background_mat
44-
):
45-
"""Get random CV label_input from subsets with different label_input."""
46-
input_labels = set()
47-
input_unlabeled = set()
48-
49-
validation_mats_by_entity_type = defaultdict()
50-
51-
for entity_type, input in input_subsets.items():
52-
hidden_input = hide_true_positives(input[0])
53-
validation_mats_by_entity_type[entity_type] = generate_categoric_input_from_labels(
54-
input[0],
55-
'Dataset 1 ' + str(
56-
entity_type),
57-
background_mat
58-
)
59-
input_unlabeled.update(set(input[0]))
60-
input_labels.update(set(hidden_input))
61-
62-
input_mat = generate_categoric_input_from_labels(input_labels, 'Dataset1', background_mat, input_unlabeled)
63-
64-
return input_mat, validation_mats_by_entity_type
65-
66-
67-
# Partial cross validation_datasets
68-
69-
def get_one_x_in_cv_inputs_from_subsets(
70-
input_subsets,
71-
background_mat,
72-
one_in='Reactome',
73-
rows_unlabeled=False,
74-
missing_value=-1
75-
):
76-
"""Get one cross label_input from subsets."""
77-
input_dict = {}
78-
input_labels = input_subsets.pop(one_in)
79-
rows_unlabel = None
80-
81-
for labels_type, validation_labels in input_subsets.items():
82-
if rows_unlabeled:
83-
rows_unlabel = validation_labels
84-
missing_value = -1
85-
86-
input_dict[labels_type] = (
87-
generate_categoric_input_from_labels(
88-
input_labels,
89-
'two out label_input',
90-
background_mat,
91-
missing_value,
92-
rows_unlabeled=rows_unlabel
93-
),
94-
generate_categoric_input_from_labels(
95-
validation_labels,
96-
'two out label_input',
97-
background_mat,
98-
missing_value,
99-
)
100-
)
101-
return input_dict
102-
103-
104-
def get_metrics(validation_labels,
105-
scores
106-
):
107-
"""Return metrics."""
108-
validation_labels_vec = validation_labels.__copy__()
109-
110-
return metrics.roc_auc_score(validation_labels.mat, scores.mat), metrics.average_precision_score(
111-
validation_labels.mat, scores.mat)
112-
113-
114-
def cross_validation_by_subset_same_diff_input(mapping_by_subsets,
115-
kernel,
116-
k=3,
117-
z=True):
118-
"""Cross validation helper."""
119-
auroc_metrics = defaultdict(list)
120-
auprc_metrics = defaultdict(list)
121-
122-
for i in tqdm(range(k)):
123-
input_mat, validation_inputs_by_subsets = get_random_cv_inputs_from_subsets_same_diff_input(
124-
mapping_by_subsets,
125-
kernel,
126-
)
127-
128-
scores = diffuse_raw(graph=None, scores=input_mat, k=kernel, z=z)
129-
130-
for entity, validation_labels in validation_inputs_by_subsets.items():
131-
auroc, auprc = get_metrics(validation_labels, scores)
132-
auroc_metrics[entity].append(auroc)
133-
auprc_metrics[entity].append(auprc)
134-
135-
return auroc_metrics, auprc_metrics
136-
137-
138-
def cross_validation_one_x_in(mapping_by_subsets,
139-
kernel,
140-
k=1,
141-
missing_value=-1,
142-
disjoint=False,
143-
rows_unlabeled=False,
144-
z=False
145-
):
146-
"""Cross validation one."""
147-
auroc_metrics = defaultdict(lambda: defaultdict(list))
148-
auprc_metrics = defaultdict(lambda: defaultdict(list))
149-
150-
scores_dict = defaultdict(lambda: defaultdict(list))
151-
validation_dict = defaultdict(lambda: defaultdict(list))
152-
input_dict = defaultdict(lambda: defaultdict(list))
153-
154-
if disjoint:
155-
mapping_by_subsets = random_disjoint_intersection_three_subsets(mapping_by_subsets)
156-
157-
for i in tqdm(range(k)):
158-
159-
for diffuse_input_type in tqdm(mapping_by_subsets):
160-
inputs = get_one_x_in_cv_inputs_from_subsets(
161-
dict(mapping_by_subsets),
162-
kernel,
163-
one_in=diffuse_input_type,
164-
rows_unlabeled=rows_unlabeled,
165-
missing_value=missing_value,
166-
)
167-
168-
for validation_type, validation_labels in inputs.items():
169-
input_diffuse, input_validation = validation_labels[0], validation_labels[1]
170-
171-
# Input test
172-
# validate_cross_validation_input_1(input_diffuse, input_validation, validation_input_from_dict(mapping_by_subsets, diffuse_input_type, validation_type, input_diffuse))
173-
174-
# Run diffusion
175-
scores = diffuse_raw(graph=None, scores=input_diffuse, k=kernel, z=z)
176-
177-
scores.cols_labels = ['scores']
178-
input_validation.cols_labels = ['input_validation']
179-
input_diffuse.cols_labels = ['input_diffuse']
180-
181-
auroc, auprc = get_metrics(input_validation, scores)
182-
183-
auroc_metrics[diffuse_input_type][validation_type].append(auroc)
184-
auprc_metrics[diffuse_input_type][validation_type].append(auprc)
185-
186-
scores.col_bind(matrix=input_validation)
187-
scores.col_bind(matrix=input_diffuse)
188-
189-
scores_dict[diffuse_input_type][validation_type].append(scores)
21+
from .utils import split_random_two_subsets
19022

191-
return dict(auroc_metrics), dict(auprc_metrics), dict(scores_dict)
23+
"""Random cross validation datasets functions"""
19224

19325

194-
# Method cross validation_datasets
195-
196-
197-
def generate_random_score_ranking(background_mat):
198-
"""Generate random scores."""
199-
return Matrix(
200-
mat=np.random.rand(len(background_mat.rows_labels)),
201-
rows_labels=background_mat.rows_labels,
202-
cols_labels=['Radom_Baseline'],
203-
)
204-
205-
206-
def cross_validation_by_method(all_labels_mapping,
26+
def cross_validation_by_method(data_input,
20727
graph,
20828
kernel,
20929
k=100
@@ -213,26 +33,26 @@ def cross_validation_by_method(all_labels_mapping,
21333
auprc_metrics = defaultdict(list)
21434

21535
for _ in tqdm(range(k)):
216-
input_diff, validation_diff = get_random_cv_split_input_and_validation(
217-
all_labels_mapping, kernel
36+
input_diff, validation_diff = _get_random_cv_split_input_and_validation(
37+
data_input, kernel
21838
)
21939

22040
scores_z = diffuse_raw(graph=None, scores=input_diff, k=kernel, z=True)
22141
scores_raw = diffuse_raw(graph=None, scores=input_diff, k=kernel, z=False)
22242
scores_page_rank = generate_pagerank_baseline(graph, kernel)
22343

224-
method_validation_inputs = {
44+
method_validation_scores = {
22545
'raw': (validation_diff,
22646
scores_raw
22747
),
22848
'z': (validation_diff,
22949
scores_z
23050
),
231-
'random_baseline': (
51+
'random': (
23252
validation_diff,
233-
generate_random_score_ranking(kernel)
53+
_generate_random_score_ranking(kernel)
23454
),
235-
'page_rank_baseline': (
55+
'page_rank': (
23656
validation_diff,
23757
scores_page_rank
23858
),
@@ -245,3 +65,86 @@ def cross_validation_by_method(all_labels_mapping,
24565
auprc_metrics[method].append(auprc)
24666

24767
return auroc_metrics, auprc_metrics
68+
69+
70+
"""Helper functions for random cross-validation"""
71+
72+
73+
def _generate_random_score_ranking(background_mat):
    """Generate a uniform-random score for every node of the background matrix.

    Serves as a negative-control baseline to compare diffusion methods against.

    :param background_mat: Matrix whose ``rows_labels`` define the node universe.
    :return: single-column Matrix of random scores drawn from [0, 1).
    """
    return Matrix(
        mat=np.random.rand(len(background_mat.rows_labels)),
        rows_labels=background_mat.rows_labels,
        # Fixed label typo: was 'Radom_Baseline'.
        cols_labels=['Random_Baseline'],
    )
80+
81+
82+
def _get_random_cv_split_input_and_validation(input: Union[list, set],
                                              background_mat: Matrix
                                              ) -> Tuple[Matrix, Matrix]:
    """Randomly split the labels into a diffusion input and a validation set.

    Half of the labels are kept as diffusion input (true positives hidden), the
    other half is held out for validation; both halves are formatted against
    the background matrix.

    :param input: collection of labels to split.
    :param background_mat: background Matrix defining the label universe.
    :return: tuple of (diffusion input Matrix, validation Matrix).
    """
    input_labels, validation_labels = split_random_two_subsets(input)

    # Quantitative validation scores are binarized before formatting.
    if isinstance(validation_labels, dict):
        validation_labels = process_input_data(validation_labels, binning=True, threshold=0.5)

    diffusion_input = format_input_for_diffusion(
        input_labels,
        background_mat,
        title='label_input with hidden true positives',
    )
    validation_input = format_input_for_diffusion(
        validation_labels,
        background_mat,
        title='original label_input labels',
    )

    return diffusion_input, validation_input
103+
104+
105+
def _get_metrics(validation_labels,
                 scores
                 ):
    """Score the diffusion output against the held-out validation labels.

    :param validation_labels: Matrix of binary ground-truth labels.
    :param scores: Matrix of diffusion scores, aligned with the labels.
    :return: tuple (AUROC, AUPRC).
    """
    y_true = validation_labels.mat
    y_score = scores.mat

    auroc = metrics.roc_auc_score(y_true, y_score)
    auprc = metrics.average_precision_score(y_true, y_score)

    return auroc, auprc
111+
112+
113+
"""Statistical test"""
114+
115+
116+
def get_p_values(metrics):
    """Run a paired t-test for every pair of methods.

    :param metrics: mapping of method name -> list of per-fold metric values
        (all lists fold-aligned and of equal length).
    :return: dict mapping the stringified pair of method names to the paired
        t-test p-value.
    """
    p_values = {}

    # Every unordered pair of methods, in dict insertion order.
    for (name_a, values_a), (name_b, values_b) in itertools.combinations(metrics.items(), 2):
        test_result = stats.ttest_rel(a=values_a, b=values_b)
        p_values[str((name_a, name_b))] = test_result.pvalue

    return p_values
127+
128+
129+
def get_p_values_multiple(metrics):
    """Run a paired t-test between two result sets, method by method.

    :param metrics: two-element sequence of dicts (method name -> list of
        per-fold values); the second dict must contain every key of the first.
    :return: dict mapping str(method name) to the paired t-test p-value.
    """
    first, second = metrics[0], metrics[1]

    return {
        str(method): stats.ttest_rel(a=values, b=second[method]).pvalue
        for method, values in first.items()
    }
137+
138+
139+
def get_normalized_p_values(p_values):
    """FDR-correct the p-values and return the -log10 of each corrected value.

    Benjamini/Hochberg correction (independent tests, alpha=0.05) is applied
    across all values at once.

    :param p_values: dict mapping a label to its raw p-value.
    :return: dict mapping each label to -log10(corrected p-value).
    """
    # fdrcorrection returns (rejected flags, corrected p-values); only the
    # corrected values are needed here.
    _, corrected = fdrcorrection(
        list(p_values.values()),
        alpha=0.05,
        method='indep',
        is_sorted=False,
    )

    return {
        label: -math.log10(p_corr)
        for label, p_corr in zip(p_values.keys(), corrected)
    }

0 commit comments

Comments
 (0)