11# -*- coding: utf-8 -*-
22
33"""Cross-validation utilities."""
4-
4+ import itertools
5+ import math
56from collections import defaultdict
7+ from typing import Union , Tuple
68
79import numpy as np
810from diffupy .diffuse_raw import diffuse_raw
11+ from diffupy .kernels import regularised_laplacian_kernel
912from diffupy .matrix import Matrix
10- from diffupy .process_input import generate_categoric_input_from_labels
13+ from diffupy .process_input import format_input_for_diffusion , process_input_data
14+ from pybel import get_subgraph_by_annotation_value
15+ from scipy import stats
1116from sklearn import metrics
17+ from statsmodels .stats .multitest import fdrcorrection
1218from tqdm import tqdm
1319
14- from typing import Union , Tuple
15-
1620from .topological_analyses import generate_pagerank_baseline
17- from .utils import random_disjoint_intersection_three_subsets , hide_true_positives , split_random_two_subsets
18-
19- """Random cross validation_datasets"""
20-
21-
def get_random_cv_split_input_and_validation(input: Union[list, set],
                                             background_mat: Matrix
                                             ) -> Tuple[Matrix, Matrix]:
    """Randomly split *input* in two and encode both halves as categorical matrices.

    :param input: labels to split into a diffusion half and a validation half
    :param background_mat: background matrix providing the row universe
    :return: (input matrix with hidden true positives, validation matrix)
    """
    kept_labels, held_out_labels = split_random_two_subsets(input)

    diffusion_input = generate_categoric_input_from_labels(
        kept_labels,
        'label_input with hidden true positives',
        background_mat
    )
    validation_input = generate_categoric_input_from_labels(
        held_out_labels,
        'original label_input labels',
        background_mat
    )
    return diffusion_input, validation_input
40-
41-
def get_random_cv_inputs_from_subsets_same_diff_input(input_subsets: dict,
                                                      background_mat
                                                      ):
    """Get random CV label_input from subsets with different label_input.

    :param input_subsets: mapping of entity type -> subset, where the subset's
        first element holds the labels (annotation fixed: ``.items()`` is
        called on it, so it must be a mapping, not ``Union[list, set]``)
    :param background_mat: background matrix providing the row universe
    :return: (combined input matrix, dict of validation matrices keyed by entity type)
    """
    input_labels = set()
    input_unlabeled = set()

    # Plain dict: the previous defaultdict() had no default factory,
    # so it behaved exactly like a dict anyway.
    validation_mats_by_entity_type = {}

    # Loop variable renamed from `input`, which shadowed the builtin.
    for entity_type, subset in input_subsets.items():
        hidden_input = hide_true_positives(subset[0])
        validation_mats_by_entity_type[entity_type] = generate_categoric_input_from_labels(
            subset[0],
            'Dataset 1 ' + str(entity_type),
            background_mat
        )
        input_unlabeled.update(set(subset[0]))
        input_labels.update(set(hidden_input))

    input_mat = generate_categoric_input_from_labels(input_labels, 'Dataset1', background_mat, input_unlabeled)

    return input_mat, validation_mats_by_entity_type
65-
66-
# Partial cross-validation datasets
68-
def get_one_x_in_cv_inputs_from_subsets(
        input_subsets,
        background_mat,
        one_in='Reactome',
        rows_unlabeled=False,
        missing_value=-1
):
    """Build (diffusion input, validation input) pairs with one subset held in.

    NOTE: this pops *one_in* out of *input_subsets*, mutating the caller's dict.

    :param input_subsets: mapping of label type -> labels
    :param background_mat: background matrix providing the row universe
    :param one_in: key of the subset used as the diffusion input
    :param rows_unlabeled: if truthy, pass the validation labels as unlabeled rows
    :param missing_value: fill value for unlabeled entries
    :return: dict mapping each remaining label type to its (input, validation) pair
    """
    input_dict = {}
    input_labels = input_subsets.pop(one_in)

    rows_unlabel = None
    for labels_type, validation_labels in input_subsets.items():
        if rows_unlabeled:
            rows_unlabel = validation_labels
            missing_value = -1  # forced whenever unlabeled rows are requested

        diffusion_input = generate_categoric_input_from_labels(
            input_labels,
            'two out label_input',
            background_mat,
            missing_value,
            rows_unlabeled=rows_unlabel
        )
        validation_input = generate_categoric_input_from_labels(
            validation_labels,
            'two out label_input',
            background_mat,
            missing_value,
        )
        input_dict[labels_type] = (diffusion_input, validation_input)

    return input_dict
102-
103-
def get_metrics(validation_labels,
                scores
                ):
    """Return (AUROC, AUPRC) for *scores* against binary *validation_labels*.

    :param validation_labels: Matrix of ground-truth binary labels
    :param scores: Matrix of predicted scores, row-aligned with validation_labels
    :return: tuple (roc_auc_score, average_precision_score)
    """
    # The previous implementation copied validation_labels into an unused
    # local (validation_labels_vec); that dead code is removed.
    return (
        metrics.roc_auc_score(validation_labels.mat, scores.mat),
        metrics.average_precision_score(validation_labels.mat, scores.mat),
    )
112-
113-
def cross_validation_by_subset_same_diff_input(mapping_by_subsets,
                                               kernel,
                                               k=3,
                                               z=True):
    """Run *k* rounds of random cross-validation, scoring each entity subset.

    :param mapping_by_subsets: mapping of entity type -> input subset
    :param kernel: diffusion kernel matrix (also used as the background)
    :param k: number of cross-validation repetitions
    :param z: whether to z-normalise the diffusion scores
    :return: two dicts (AUROC, AUPRC) mapping entity type -> list of k values
    """
    auroc_metrics = defaultdict(list)
    auprc_metrics = defaultdict(list)

    for _ in tqdm(range(k)):
        input_mat, validation_by_entity = get_random_cv_inputs_from_subsets_same_diff_input(
            mapping_by_subsets,
            kernel,
        )

        # Diffuse the hidden-positives input over the kernel.
        scores = diffuse_raw(graph=None, scores=input_mat, k=kernel, z=z)

        # Evaluate the diffusion output against each entity-type validation set.
        for entity, validation_labels in validation_by_entity.items():
            auroc, auprc = get_metrics(validation_labels, scores)
            auroc_metrics[entity].append(auroc)
            auprc_metrics[entity].append(auprc)

    return auroc_metrics, auprc_metrics
136-
137-
def cross_validation_one_x_in(mapping_by_subsets,
                              kernel,
                              k=1,
                              missing_value=-1,
                              disjoint=False,
                              rows_unlabeled=False,
                              z=False
                              ):
    """Leave-one-subset-in cross-validation: diffuse each subset, validate on the rest.

    :param mapping_by_subsets: mapping of label type -> label subset
    :param kernel: diffusion kernel matrix (also the background)
    :param k: number of repetitions
    :param missing_value: fill value for unlabelled entries in the input matrices
    :param disjoint: if True, reduce the subsets to disjoint intersections first
    :param rows_unlabeled: forwarded to the input-building helper
    :param z: whether to z-normalise the diffusion scores
    :return: (auroc_metrics, auprc_metrics, scores_dict) — nested dicts keyed
        by [diffused input type][validation type] -> list over repetitions
    """
    auroc_metrics = defaultdict(lambda: defaultdict(list))
    auprc_metrics = defaultdict(lambda: defaultdict(list))
    scores_dict = defaultdict(lambda: defaultdict(list))
    # NOTE: the previous version also allocated validation_dict/input_dict
    # accumulators that were never read; they are removed as dead code.

    if disjoint:
        mapping_by_subsets = random_disjoint_intersection_three_subsets(mapping_by_subsets)

    for _ in tqdm(range(k)):

        for diffuse_input_type in tqdm(mapping_by_subsets):
            # A copy is passed because the helper pops the chosen subset.
            inputs = get_one_x_in_cv_inputs_from_subsets(
                dict(mapping_by_subsets),
                kernel,
                one_in=diffuse_input_type,
                rows_unlabeled=rows_unlabeled,
                missing_value=missing_value,
            )

            for validation_type, (input_diffuse, input_validation) in inputs.items():
                # Run diffusion over the kernel.
                scores = diffuse_raw(graph=None, scores=input_diffuse, k=kernel, z=z)

                scores.cols_labels = ['scores']
                input_validation.cols_labels = ['input_validation']
                input_diffuse.cols_labels = ['input_diffuse']

                auroc, auprc = get_metrics(input_validation, scores)

                auroc_metrics[diffuse_input_type][validation_type].append(auroc)
                auprc_metrics[diffuse_input_type][validation_type].append(auprc)

                # Keep the input/validation columns alongside the scores for inspection.
                scores.col_bind(matrix=input_validation)
                scores.col_bind(matrix=input_diffuse)

                scores_dict[diffuse_input_type][validation_type].append(scores)

    return dict(auroc_metrics), dict(auprc_metrics), dict(scores_dict)
23+ """Random cross validation datasets functions"""
19224
19325
# Method cross-validation datasets
195-
196-
def generate_random_score_ranking(background_mat):
    """Generate a uniform-random score column over the background rows.

    :param background_mat: Matrix whose row labels define the score universe
    :return: Matrix of random scores used as a baseline ranking
    """
    return Matrix(
        mat=np.random.rand(len(background_mat.rows_labels)),
        rows_labels=background_mat.rows_labels,
        cols_labels=['Random_Baseline'],  # fixed typo: was 'Radom_Baseline'
    )
204-
205-
206- def cross_validation_by_method (all_labels_mapping ,
26+ def cross_validation_by_method (data_input ,
20727 graph ,
20828 kernel ,
20929 k = 100
@@ -213,26 +33,26 @@ def cross_validation_by_method(all_labels_mapping,
21333 auprc_metrics = defaultdict (list )
21434
21535 for _ in tqdm (range (k )):
216- input_diff , validation_diff = get_random_cv_split_input_and_validation (
217- all_labels_mapping , kernel
36+ input_diff , validation_diff = _get_random_cv_split_input_and_validation (
37+ data_input , kernel
21838 )
21939
22040 scores_z = diffuse_raw (graph = None , scores = input_diff , k = kernel , z = True )
22141 scores_raw = diffuse_raw (graph = None , scores = input_diff , k = kernel , z = False )
22242 scores_page_rank = generate_pagerank_baseline (graph , kernel )
22343
224- method_validation_inputs = {
44+ method_validation_scores = {
22545 'raw' : (validation_diff ,
22646 scores_raw
22747 ),
22848 'z' : (validation_diff ,
22949 scores_z
23050 ),
231- 'random_baseline ' : (
51+ 'random ' : (
23252 validation_diff ,
233- generate_random_score_ranking (kernel )
53+ _generate_random_score_ranking (kernel )
23454 ),
235- 'page_rank_baseline ' : (
55+ 'page_rank ' : (
23656 validation_diff ,
23757 scores_page_rank
23858 ),
@@ -245,3 +65,86 @@ def cross_validation_by_method(all_labels_mapping,
24565 auprc_metrics [method ].append (auprc )
24666
24767 return auroc_metrics , auprc_metrics
68+
69+
70+ """Helper functions for random cross-validation"""
71+
72+
def _generate_random_score_ranking(background_mat):
    """Generate a uniform-random score column over the background rows.

    :param background_mat: Matrix whose row labels define the score universe
    :return: Matrix of random scores used as a baseline ranking
    """
    return Matrix(
        mat=np.random.rand(len(background_mat.rows_labels)),
        rows_labels=background_mat.rows_labels,
        cols_labels=['Random_Baseline'],  # fixed typo: was 'Radom_Baseline'
    )
80+
81+
def _get_random_cv_split_input_and_validation(input: Union[list, set],
                                              background_mat: Matrix
                                              ) -> Tuple[Matrix, Matrix]:
    """Randomly split *input* in two and format both halves for diffusion.

    :param input: labels (or label -> value mapping) to split
    :param background_mat: background matrix providing the row universe
    :return: (diffusion input matrix, validation matrix)
    """
    kept_labels, held_out_labels = split_random_two_subsets(input)

    # Quantitative validation data is binned into binary labels first.
    if isinstance(held_out_labels, dict):
        held_out_labels = process_input_data(held_out_labels, binning=True, threshold=0.5)

    diffusion_input = format_input_for_diffusion(
        kept_labels,
        background_mat,
        title='label_input with hidden true positives'
    )
    validation_input = format_input_for_diffusion(
        held_out_labels,
        background_mat,
        title='original label_input labels'
    )
    return diffusion_input, validation_input
103+
104+
def _get_metrics(validation_labels,
                 scores
                 ):
    """Return (AUROC, AUPRC) of *scores* against binary *validation_labels*."""
    truth = validation_labels.mat
    predicted = scores.mat

    auroc = metrics.roc_auc_score(truth, predicted)
    auprc = metrics.average_precision_score(truth, predicted)
    return auroc, auprc
111+
112+
113+ """Statistical test"""
114+
115+
def get_p_values(metrics):
    """Compute pairwise paired t-test p-values between metric lists.

    NOTE: the parameter name shadows the module-level sklearn ``metrics``
    import; it is kept to preserve the interface for keyword callers.

    :param metrics: dict mapping method name -> list of metric values
        (all lists must be the same length)
    :return: dict mapping ``str((name_a, name_b))`` -> paired t-test p-value
    """
    p_values = {}

    # Direct tuple unpacking replaces the previous map(dict, ...) construction,
    # which built a throwaway dict per pair only to read its keys and values.
    for (name_a, values_a), (name_b, values_b) in itertools.combinations(metrics.items(), 2):
        ttest = stats.ttest_rel(a=values_a, b=values_b)
        p_values[str((name_a, name_b))] = ttest.pvalue

    return p_values
127+
128+
def get_p_values_multiple(metrics):
    """Run a paired t-test per shared key between two metric collections.

    :param metrics: two-element sequence of dicts mapping key -> list of
        values; keys of ``metrics[0]`` must also be present in ``metrics[1]``
    :return: dict mapping ``str(key)`` -> paired t-test p-value
    """
    first, second = metrics[0], metrics[1]

    return {
        str(key): stats.ttest_rel(a=values, b=second[key]).pvalue
        for key, values in first.items()
    }
137+
138+
def get_normalized_p_values(p_values):
    """Apply Benjamini-Hochberg FDR correction and return -log10 of the q-values.

    :param p_values: dict mapping label -> raw p-value
    :return: dict mapping label -> ``-log10`` of the FDR-corrected p-value
    :raises ValueError: from ``math.log10`` if a corrected p-value is 0
    """
    _, corrected = fdrcorrection(list(p_values.values()),
                                 alpha=0.05,
                                 method='indep',
                                 is_sorted=False)

    # dict preserves insertion order, so corrected values align with the keys;
    # zip replaces the previous enumerate-and-index loop.
    return {
        label: -math.log10(q_value)
        for label, q_value in zip(p_values, corrected)
    }
0 commit comments