maxATAC/maxatac/analyses/predict.py at 65c14ddbea9eb5eceb0aa836c96e451dc3a5c947 · MiraldiLab/maxATAC · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
Predict TF binding with a maxATAC model
"""
import logging
import os
import glob
import timeit
import pandas as pd
import multiprocessing
import pybedtools
from multiprocessing import Pool, Manager
from maxatac.utilities.system_tools import get_dir, Mute

with Mute():
    from maxatac.utilities.genome_tools import build_chrom_sizes_dict
    from maxatac.utilities.peak_tools import get_threshold
    from maxatac.utilities.prediction_tools import write_predictions_to_bigwig, \
        create_prediction_regions, make_stranded_predictions


def run_prediction(args):
    """
    Predict TF binding with a maxATAC model. The user can provide a bed file of regions to predict on,
    called windows, or prediction regions can be created based on the chromosome of interest. The default prediction
    will predict across all autosomal chromosomes.

    Peak file requirements for prediction. You must have at least a 3 column file with chromosome, start,
    and stop coordinates.

    Windows file requirements for prediction. These windows will be directly input into the maxATAC data generator
    and should be 1,024 bp wide. The window step should be uniform across the chromosome.

    The user can decide whether to make only predictions on the forward strand or also make prediction on the reverse
    strand. If the user wants both strand, signal tracks will be produced for the forward, reverse, and mean-combined
    bigwig signal tracks will be produced.

    Workflow overview

    1) Create directories and set up filenames
    2) Prepare regions for prediction. Either import user defined regions or create regions based on chromosomes list.
    3) Make predictions.
    4) Convert predictions to bigwig format and write results.
    5) Write predictions to an optional BED formated file of regions above a specific threshold.

    Args: TF, output_directory, name, signal, sequence, model, threads, batch_size, roi, cutoff_type, cutoff_value,
    cutoff_file, chrom_sizes, blacklist, average, windows, loglevel, step_size, chromosomes, skip_call_peaks
    """
    # Start Timer
    startTime = timeit.default_timer()

    # If the user provides the TF name,
    if args.TF:
        args.model = glob.glob(os.path.join(args.DATA_PATH, "models", args.TF, args.TF + "*.h5"))[0]

        args.cutoff_file = glob.glob(os.path.join(args.DATA_PATH, "models", args.TF, args.TF + "*.tsv"))[0]

    else:
        pass

    logging.info(f"Using maxATAC model: {args.model} to make predictions")

    # predict on all chromosomes
    if args.chromosomes[0] == 'all':
        from maxatac.utilities.constants import AUTOSOMAL_CHRS as all_chr
        args.chromosomes = all_chr

    # Create the output directory set by the parser
    logging.debug(f"Make output directory: {args.output_directory}")
    output_directory = get_dir(args.output_directory)

    # Output filename for the bigwig predictions file based on the output directory and the prefix. Add the bw extension
    outfile_name_bigwig = os.path.join(output_directory, args.name + ".bw")

    # The function build_chrom_sizes_dict is used to make sure regions fall within chromosome bounds.
    # Create a dictionary of chromosome sizes used to make the bigwig files
    chrom_sizes_dict = build_chrom_sizes_dict(args.chromosomes, args.chrom_sizes)
    logging.debug(f"Chromosome size dictionary: {chrom_sizes_dict}")

    # Import the regions for prediction.
    logging.info("Create prediction regions")
    regions_pool = create_prediction_regions(chromosomes=args.chromosomes,
                                             chrom_sizes=args.chrom_sizes,
                                             blacklist=args.blacklist,
                                             step_size=args.step_size,
                                             peaks=args.roi,
                                             windows=args.windows
                                             )

    # Find the chromosomes for which we can make predictions based on the requested chroms
    # and the BED regions provided in the ROI file
    chrom_list = list(set(args.chromosomes).intersection(set(regions_pool["chr"].unique())))

    logging.info("Prediction Parameters \n" +
                 f"Output filename: {outfile_name_bigwig} \n" +
                 f"Target signal: {args.signal} \n" +
                 f"Sequence data: {args.sequence} \n" +
                 f"Model: {args.model} \n" +
                 "Chromosome requested: \n   - " + "\n    -".join(args.chromosomes) + "\n" +
                 "Chromosomes in final prediction set: \n   - " + "\n    -".join(chrom_list) + "\n" +
                 f"Output directory: {output_directory} \n" +
                 f"Batch Size: {args.batch_size} \n" +
                 f"Output filename: {outfile_name_bigwig}"
                 )

    with Pool(args.threads) as p:
        forward_strand_predictions = p.starmap(make_stranded_predictions,
                                               [(regions_pool,
                                                 args.signal,
                                                 args.sequence,
                                                 args.model,
                                                 args.batch_size,
                                                 False,
                                                 chromosome) for chromosome in chrom_list])

    # Write the predictions to a bigwig file and add name to args
    prediction_bedgraph = pd.concat(forward_strand_predictions)

    logging.info("Write predictions to a bigwig file")
    write_predictions_to_bigwig(prediction_bedgraph,
                                output_filename=outfile_name_bigwig,
                                chrom_sizes_dictionary=chrom_sizes_dict,
                                chromosomes=chrom_list
                                )

    # If a cutoff file is provided, call peaks
    if args.cutoff_file and args.skip_call_peaks is False:
        args.input_bigwig = outfile_name_bigwig

        peaks_filename = os.path.join(output_directory, args.name + "_peaks.bed")

        thresh = get_threshold(cutoff_file=args.cutoff_file,
                               cutoff_type=args.cutoff_type,
                               cutoff_val=args.cutoff_value)

        logging.info(f"Writing predictions to a BED file: {peaks_filename}" +
                     f"\n Cutoff type for Threshold: {args.cutoff_type}" +
                     f"\n Cutoff value: {args.cutoff_value}" +
                     f"\n Corresponding Threshold for Cutoff Type and Value discovered: {thresh}" +
                     f"\n Output filename: {peaks_filename}")

        logging.debug(f"Filtering predictions by threshold score.")
        prediction_bedgraph = prediction_bedgraph[prediction_bedgraph["score"] >= thresh]

        # Convert the dataframe to a bedtools object
        peaks_bedtool = pybedtools.BedTool.from_dataframe(prediction_bedgraph)

        # Sort the bed intervals
        sorted_peaks = peaks_bedtool.sort()

        # Merge overlapping intervals
        merged_peaks = sorted_peaks.merge(c=4, o="max")

        # Convert bedtool object to dataframe
        prediction_bed = merged_peaks.to_dataframe()

        logging.debug(f"Writing BED file.")
        # Write dataframe to a bed format file
        prediction_bed.to_csv(peaks_filename,
                  sep="\t",
                  index=False,
                  header=False)


    # Measure end time of training
    stopTime = timeit.default_timer()
    totalTime = stopTime - startTime

    # Output running time in a nice format.
    mins, secs = divmod(totalTime, 60)
    hours, mins = divmod(mins, 60)

    logging.info("Total Prediction time: %d:%d:%d.\n" % (hours, mins, secs))