Refactor the DiffuPath CLI and adapt it to the recoded cross-validation module

jmarinllao · jmarinllao · commit b64b4828363a · 2020-05-10T17:14:59.000+02:00
diff --git a/src/diffupath/cli.py b/src/diffupath/cli.py
@@ -5,26 +5,26 @@
 import json
 import logging
 import sys
+from collections import defaultdict
 
 import click
+import networkx as nx
 from bio2bel.constants import get_global_connection
+from diffupath.utils import reduce_dict_dimension
+from diffupy.constants import EMOJI, RAW, CSV, JSON
 from diffupy.diffuse import diffuse as run_diffusion
-from diffupy.process_input import process_input
-from diffupy.utils import process_network_from_cli
+from diffupy.process_input import process_map_and_format_input_data_for_diff
+from diffupy.process_network import get_kernel_from_network_path, process_kernel_from_file, process_graph_from_file
+from diffupy.utils import from_json, to_json
 
 from .constants import *
 from .cross_validation import cross_validation_by_method
-from .input_mapping import process_input_from_cli
-from .validation_datasets_parsers import parse_set1, parse_set2, parse_set3
 
 logger = logging.getLogger(__name__)
 
-#: Parsing methods for each dataset
-PARSING_METHODS = {
-    '1': parse_set1,
-    '2': parse_set2,
-    '3': parse_set3,
-}
+GRAPH_PATH = os.path.join(DEFAULT_DIFFUPATH_DIR, 'pickles', 'universe',
+                          'pathme_universe_non_flatten_collapsed_names_no_isolates_16_03_2020.pickle')
+KERNEL_PATH = os.path.join(DEFAULT_DIFFUPATH_DIR, 'kernels', 'kernel_regularized_pathme_universe.pickle')
 
 
 @click.group(help='DiffuPy')
@@ -40,16 +40,16 @@ def diffusion():
 
 @diffusion.command()
 @click.option(
-    '-n', '--network',
-    help='Path to the network graph or kernel',
+    '-i', '--input',
+    help='Input data',
     required=True,
-    type=click.Path(exists=True, dir_okay=False)
+    type=click.Path(exists=True, dir_okay=True)
 )
 @click.option(
-    '-i', '--data',
-    help='Input data',
-    required=True,
-    type=click.Path(exists=True, dir_okay=False)
+    '-n', '--network',
+    help='Path to the network graph or kernel',
+    default=KERNEL_PATH,
+    type=click.Path(exists=True, dir_okay=True)
 )
 @click.option(
     '-o', '--output',
@@ -61,28 +61,29 @@ def diffusion():
     '-m', '--method',
     help='Diffusion method',
     type=click.Choice(METHODS),
-    required=True,
+    default=RAW,
 )
 @click.option(
     '-b', '--binarize',
     help='If logFC provided in dataset, convert logFC to binary (e.g., up-regulated entities to 1, down-regulated to '
          '-1). For scoring methods that accept quantitative values (i.e., raw & z), node labels can also be codified '
          'with LogFC (in this case, set binarize==False).',
     type=bool,
-    default=True,
+    default=False,
     show_default=True,
 )
 @click.option(
     '-t', '--threshold',
     help='Codify node labels by applying a threshold to logFC in input.',
+    default=None,
     type=float,
 )
 @click.option(
     '-a', '--absolute_value',
-    help='Codify node labels by applying threshold to |logFC| in input. If absolute_value is set to False, node labels '
-         'will be signed.',
+    help='Codify node labels by applying threshold to | logFC | in input. If absolute_value is set to False, '
+         'node labels will be signed.',
     type=bool,
-    default=True,
+    default=False,
     show_default=True,
 )
 @click.option(
@@ -92,72 +93,83 @@ def diffusion():
     default=0.05,
     show_default=True,
 )
-def diffuse(
-    network: str,
-    data: str,
-    output: str,
-    method: str,
-    binarize: bool,
-    absolute_value: bool,
-    threshold: float,
-    p_value: float,
+@click.option(
+    '-f', '--output_format',
+    help='Format of the output file.',
+    type=click.Choice([CSV, JSON]),
+    default=CSV,
+    show_default=True,
+)
+def run(
+        input: str,
+        network: str = KERNEL_PATH,
+        output: str = OUTPUT_DIR,
+        method: str = RAW,
+        binarize: bool = False,
+        threshold: float = None,
+        absolute_value: bool = False,
+        p_value: float = 0.05,
+        output_format: str = CSV
 ):
     """Run a diffusion method over a network or pre-generated kernel."""
     click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}')
-    graph = process_network_from_cli(network)
 
-    click.secho(
-        f'{EMOJI} Graph loaded with: \n'
-        f'{graph.number_of_nodes()} nodes\n'
-        f'{graph.number_of_edges()} edges\n'
-        f'{EMOJI}'
-    )
+    kernel = get_kernel_from_network_path(network)
 
-    click.secho(f'Codifying data from {data}.')
+    click.secho(f'Processing data input from {input}.')
 
-    input_scores_dict = process_input(data, method, binarize, absolute_value, p_value, threshold)
+    input_scores_dict = process_map_and_format_input_data_for_diff(input,
+                                                                   kernel,
+                                                                   method,
+                                                                   binarize,
+                                                                   absolute_value,
+                                                                   p_value,
+                                                                   threshold,
+                                                                   )
 
-    click.secho(f'Running the diffusion algorithm.')
+    click.secho(f'Computing the diffusion algorithm.')
 
     results = run_diffusion(
         input_scores_dict,
         method,
-        graph,
+        k=kernel
     )
 
-    json.dump(results, output, indent=2)
+    # Compare by value: the format arrives from the CLI as a fresh string, so an identity check is unreliable.
+    if output_format == CSV:
+        results.to_csv(output)
+    elif output_format == JSON:
+        json.dump(results, output, indent=2)
 
-    click.secho(f'Finished!')
+    click.secho(f'{EMOJI} Diffusion performed with success. Output located at {output} {EMOJI}')
 
 
 @diffusion.command()
 @click.option(
-    '-d', '--data',
-    help='Input data',
-    required=True,
-    type=click.Path(exists=True, dir_okay=False),
+    '-c', '--comparison',
+    help='Comparison method',
+    default=BY_METHOD,
+    show_default=True,
+    type=click.Choice(EVALUATION_COMPARISONS),
 )
 @click.option(
-    '-n', '--network',
-    help='Path to the network graph or kernel',
-    required=True,
+    '-i', '--input_path',
+    default=os.path.join(ROOT_RESULTS_DIR, 'data', 'input_mappings'),
+    show_default=True,
+    type=click.Path(exists=True, dir_okay=True),
+)
+@click.option(
+    '-k', '--kernel',
+    help='Path to the kernel',
+    default=KERNEL_PATH,
     type=click.Path(exists=True, dir_okay=False)
 )
 @click.option(
-    '-g', '--graph_path',
+    '-g', '--graph',
     help='Path to the network as a graph',
+    default=GRAPH_PATH,
     type=click.Path(exists=True, dir_okay=False),
 )
-@click.option(
-    '-q', '--quantitative',  # TODO Automatize if possible, check type of label_input.
-    help='Generate categorical label_input from labels',
-    is_flag=False,
-)
-@click.option(
-    '-n', '--network_as_graph',
-    help='If given expects graph else expects as a kernel',
-    is_flag=False,
-)
 @click.option(
     '-o', '--output',
     help='Output path for the results',
@@ -172,78 +184,83 @@ def diffuse(
     show_default=True,
     type=int,
 )
-@click.option(
-    '-c', '--comparison',
-    help='Comparison method',
-    default='by_method',
-    show_default=True,
-    type=click.Choice(EVALUATION_METHODS),
-)
-@click.option(
-    '-k', '--dataset',
-    help='Key for the datasets presented in the paper',
-    show_default=True,
-    default=1,
-    type=click.Choice(DATASETS),
-)
 def evaluate(
-    data: str,
-    network: str,
-    graph_path: str,
-    quantitative: bool,  # TODO Automatize if possible, check type of label_input.
-    network_as_graph: bool,  # TODO Automatize if possible, check type of graph.
-    output: str,
-    iterations: int,
-    comparison: str,
-    dataset: int,
+        comparison: str = BY_METHOD,
+        input_path: str = os.path.join(ROOT_RESULTS_DIR, 'data', 'input_mappings'),
+        graph: str = GRAPH_PATH,
+        kernel: str = KERNEL_PATH,
+        output: str = OUTPUT_DIR,
+        iterations: int = 100,
 ):
     """Evaluate a kernel/network on one of the three presented datasets."""
-    click.secho(f'{EMOJI} Loading label_input for cross-validation... {EMOJI}')
-
-    if not network_as_graph and not graph_path:
-        raise ValueError("Network not provided in graph format, which is required for evaluation.")
-
-    _, kernel, labels_mapping, graph = process_input_from_cli(
-        PARSING_METHODS[dataset],
-        network,
-        data,
-        network_as_graph,
-        quantitative,
-    )
+    click.secho(f'{EMOJI} Loading network for random cross-validation... {EMOJI}')
+    graph = process_graph_from_file(graph)
+    kernel = process_kernel_from_file(kernel)
+
+    nx.number_of_isolates(graph)
+    graph.remove_nodes_from({
+        node
+        for node in nx.isolates(graph)
+    })
+
+    graph.summarize()
+
+    click.secho(f'{EMOJI} Loading data for cross-validation... {EMOJI}')
+    MAPPING_PATH_DATASET_1 = os.path.join(input_path, 'dataset_1_mapping.json')
+    dataset1_mapping_by_database_and_entity = from_json(MAPPING_PATH_DATASET_1)
+    dataset1_mapping_by_database = reduce_dict_dimension(dataset1_mapping_by_database_and_entity)
+    dataset1_mapping_all_labels = {entity: entity_value
+                                   for entity_type, entity_set in dataset1_mapping_by_database.items()
+                                   for entity, entity_value in entity_set.items()
+                                   }
+
+    MAPPING_PATH_DATASET_2 = os.path.join(input_path, 'dataset_2_mapping.json')
+    dataset2_mapping_by_database_and_entity = from_json(MAPPING_PATH_DATASET_2)
+    dataset2_mapping_by_database = reduce_dict_dimension(dataset2_mapping_by_database_and_entity)
+    dataset2_mapping_all_labels = {entity: entity_value
+                                   for entity_type, entity_set in dataset2_mapping_by_database.items()
+                                   for entity, entity_value in entity_set.items()
+                                   }
+
+    MAPPING_PATH_DATASET_3 = os.path.join(input_path, 'dataset_3_mapping.json')
+    dataset3_mapping_by_database_and_entity = from_json(MAPPING_PATH_DATASET_3)
+    dataset3_mapping_by_database = reduce_dict_dimension(dataset3_mapping_by_database_and_entity)
+    dataset3_mapping_all_labels = {entity: entity_value
+                                   for entity_type, entity_set in dataset3_mapping_by_database.items()
+                                   for entity, entity_value in entity_set.items()
+                                   }
+
+    if comparison == BY_METHOD:
+        click.secho(f'{EMOJI} Evaluating by method... {EMOJI}')
 
-    if not network_as_graph:
-        graph = process_network_from_cli(graph_path)
+        metrics_by_method = defaultdict(lambda: defaultdict(list))  # missing keys start as empty lists
 
-    if comparison == 'by_method':
-        click.secho(f'{EMOJI} Evaluating by method... {EMOJI}')
+        click.secho(f'{EMOJI} Running cross_validation_by_method for Dataset 1... {EMOJI}')
+        metrics_by_method['auroc']['Dataset 1'], metrics_by_method['auprc']['Dataset 1'] = cross_validation_by_method(
+            dataset1_mapping_all_labels,
+            graph,
+            kernel,
+            k=iterations)
 
-        auroc_metrics, auprc_metrics = cross_validation_by_method(
-            labels_mapping,
+        click.secho(f'{EMOJI} Running cross_validation_by_method for Dataset 2... {EMOJI}')
+        metrics_by_method['auroc']['Dataset 2'], metrics_by_method['auprc']['Dataset 2'] = cross_validation_by_method(
+            dataset2_mapping_all_labels,
             graph,
             kernel,
-            k=iterations,
-        )
-    elif comparison == 'by_db':
-        click.secho(f'{EMOJI} Evaluating by database... {EMOJI}')
-
-        # TODO to adapt from 'get_one_x_in_cv_inputs_from_subsets', and label_input treatment subset division.
-        auroc_metrics, auprc_metrics = cross_validation_by_method(
-            labels_mapping,
+            k=iterations)
+
+        click.secho(f'{EMOJI} Running cross_validation_by_method for Dataset 3... {EMOJI}')
+        metrics_by_method['auroc']['Dataset 3'], metrics_by_method['auprc']['Dataset 3'] = cross_validation_by_method(
+            dataset3_mapping_all_labels,
             graph,
             kernel,
-            k=iterations,
-        )
+            k=iterations)
+
+
     else:
         raise ValueError("The comparison method provided not match any provided method.")
 
-    with open(os.path.join(output, 'metrics.json'), 'w') as outfile:
-        json.dump(
-            {'auroc_metrics': auroc_metrics,
-             'auprc_metrics': auprc_metrics
-             },
-            outfile,
-            indent=2,
-        )
+    to_json(metrics_by_method, output)
 
     click.secho(f'{EMOJI} Random cross-validation performed with success. Output located at {output}... {EMOJI}')
 
diff --git a/src/diffupath/topological_analyses.py b/src/diffupath/topological_analyses.py
@@ -9,7 +9,7 @@
 import networkx as nx
 import numpy as np
 from diffupy.matrix import LaplacianMatrix, Matrix
-from diffupy.utils import get_simple_graph_from_multigraph
+from diffupy.process_network import get_simple_graph_from_multigraph
 
 
 def generate_pagerank_baseline(graph: nx.Graph, background_mat: Matrix) -> Matrix: