BayesianBootstrap/example/bootstrap.py at 5a8ad750ccecbe6ab3ea4eb4a955f06175bbb61b · ZuckermanLab/BayesianBootstrap · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/python

#import scipy
import random
import numpy as np
from tqdm.auto import tqdm

################################################
############## DEFINING CI FUNCTION ############
################################################

def get_CI(List, Repeat):
    """
    Standard bootstrapping CIs.
    """
    if len(set(List)) == 1 :  	# NO CI if List has only identical elements
        return [0,0]

    else :
        AllMeans = []		# List of data(!) sample means for every bootstrap iteration
        N = len(List)                # Number of data points

        for i in range(Repeat) :	# Repeated bootstrap iterations
            CurrList = []		# Sample list
            for j in range(N) :
                CurrList.append(random.choice(List))
            AllMeans.append(np.average(CurrList))

    perc_min  = np.percentile(AllMeans,2.5)	# Minimum percentile defined over list of means
    perc_max  = np.percentile(AllMeans,97.5) 	# Maximum percentile defined over list of means

    return [perc_min, perc_max]	# Confidence Interval is defined by min/max percentiles of sampling means


################################################
############## DEFINING CR FUNCTION ############
################################################

def get_CR_bm(List, Repeat):
    """
    Original bayesian bootstrapping function from BM.
    """
    if len(set(List)) == 1 :  	# NO CR if List has only identical elements
        return [0,0]

    else :
        AllMeans = []		# List of model(!) sample means for every bootstrap iteration
        N = len(List)		# Number of data points

        for i in range(Repeat) :	# Repeated bootstrap iterations
            Rands = [0]		# Following Rubin et al. to get data probabilities from Dirichlet distrib.
            CurrAvg = 0
            for j in range(N-1) :
                Rands.append(random.random())
            Rands.append(1)
            Rands.sort()
            P=np.diff(Rands)	# List of random numbers that add to 1 and are used as data probabilities
            for j in range(N) :
                CurrAvg += P[j]*List[j]	# Sample mean
            AllMeans.append(CurrAvg)

    AllMeans.sort()
    TotalProb = len(AllMeans)
    CumulProb = 0
    perc_min  = 0
    perc_max  = 0
    for m in AllMeans :	# Iterating through sorted means, identifying that mean at which a certain percentile of probs is reached
        CumulProb += 1
        if (CumulProb > 0.025*TotalProb) and (perc_min == 0) :
            perc_min = m
        if (CumulProb > 0.975*TotalProb) and (perc_max == 0):
            perc_max = m

    return [perc_min, perc_max]		# Credibility Region is defined by min/max percentiles of sampling means

#####################################################################
################## Updated functions using numpy ####################
#####################################################################

def get_CR_single(rates, repeat):
    """
    Get a set of min and max credibility regions (CRs) from bayesian
    bootstrapping for a 1d array of n_replicates at a single timepoint.

    Parameters
    ----------
    rates : 1darray
        Rates of each replicate at a single timepoint.
    repeat : int
        n-fold bayesian bootstrapping.

    Returns
    -------
    CRs : 1darray (min CR | max CR)
        2 item array of calculated min and max CRs for the input rates array.
    """
    # check if all elements in the rates array are identical
    if np.unique(rates).size == 1:
        return np.array([0, 0])

    else:
        all_means = np.zeros(repeat)

        # n-fold (n repeat) bayesian bootstrapping
        for i in range(repeat):
            # get data probabilities from Dirichlet distrib.
            rands = np.random.dirichlet(np.ones(len(rates)))
            # calculate the sample mean using dot product
            all_means[i] = np.dot(rands, rates)

        # identifying the sorted mean at which a certain percentile of probs is reached
        all_means.sort()
        total_prob = repeat
        cumul_prob = np.arange(1, total_prob + 1)
        # calculate the percentile values
        perc_min = all_means[np.argmax(cumul_prob > 0.025 * total_prob)]
        perc_max = all_means[np.argmax(cumul_prob > 0.975 * total_prob)]

        return np.array([perc_min, perc_max])

def get_CR_multi(rates_multi, repeat):
    """
    Calculate the credibility regions of multiple timepoints.
    Input `rates_multi` array should have n_replicate rows and
    n_timepoints columns.

    Parameters
    ----------
    rates_multi : 2darray
        Rates for multiple replicates at multiple timepoints.
    repeats : int
        n-fold bayesian bootstrapping.

    Returns
    -------
    CRs : 2darray
        Calculated min and max CRs for n_replicates at each timepoint.
        n_timepoints rows and 2 columns (min CR and max CR).
    """
    # CRs will be 2 columns (min and max) and n_frames rows
    CRs = np.zeros((rates_multi.shape[1], 2))
    # loop each timepoint, so each set of n_replicate rates, must transpose
    for i, rep_rates in enumerate(tqdm(rates_multi.T)):
        CRs[i,:] = get_CR_single(rep_rates, repeat)
    return CRs

# check the updated numpy version works
# rates = [0.02, 0.005, 0.033]
# print(get_CR_bm(rates, 10000))
# print(get_CR_single(rates, 10000))