Skip to content

Commit af9bda3

Browse files
committed
Check in prototype of precompute code
1 parent c0753f5 commit af9bda3

1 file changed

Lines changed: 350 additions & 0 deletions

File tree

db/precompute_stats/precompute.py

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
# This code pasted together from prototyping notebook. Untested.
2+
from getpass import getpass
3+
from datetime import datetime
4+
from collections import namedtuple
5+
import sys
6+
import logging
7+
8+
import numpy as np
9+
import pandas as pd
10+
from scipy.stats import poisson, chisquare
11+
from sqlalchemy import create_engine
12+
import mysql.connector
13+
14+
15+
logFormatter = logging.Formatter("%(asctime)s %(levelname)s %(module)s:%(lineno)d: %(message)s")
16+
rootLogger = logging.getLogger()
17+
rootLogger.handlers.clear()
18+
fileHandler = logging.handlers.RotatingFileHandler(f"prototype.log", maxBytes=10e6)
19+
fileHandler.setFormatter(logFormatter)
20+
fileHandler.setLevel(logging.INFO)
21+
rootLogger.addHandler(fileHandler)
22+
consoleHandler = logging.StreamHandler(sys.stdout)
23+
consoleHandler.setFormatter(logFormatter)
24+
consoleHandler.setLevel(logging.INFO)
25+
rootLogger.addHandler(consoleHandler)
26+
rootLogger.setLevel(logging.INFO)
27+
28+
pwd = getpass()
29+
engine = create_engine(f'mysql+mysqlconnector://admin:{pwd}@tr-kp-clinical-db.ncats.io/cohd')
30+
conn = engine.connect()
31+
32+
connection2 = mysql.connector.connect(host='tr-kp-clinical-db.ncats.io',
33+
database='cohd',
34+
user='admin',
35+
password=pwd)
36+
37+
sql = '''
38+
SELECT COUNT(*)
39+
FROM concept_counts
40+
WHERE dataset_id = %(dataset_id)s;
41+
'''
42+
for dataset_id in range(1, 5):
43+
cur = conn.exec_driver_sql(sql, {'dataset_id': dataset_id})
44+
n_rows = cur.fetchone()[0]
45+
print(f'{dataset_id}: {n_rows} rows')
46+
47+
CohdCalculations = namedtuple('CohdCalculations', ['p_value', 'pair_count_ci', 'ln_ratio', 'ln_ratio_ci',
48+
'log_odds', 'log_odds_ci'])
49+
50+
51+
def poisson_ci(freq, confidence=0.99):
52+
""" Assuming two Poisson processes (1 for the event rate and 1 for randomization), calculate the confidence interval
53+
for the true rate
54+
55+
Parameters
56+
----------
57+
freq: float - co-occurrence frequency
58+
confidence: float - desired confidence. range: [0, 1]
59+
60+
Returns
61+
-------
62+
(lower bound, upper bound)
63+
"""
64+
alpha = 1 - confidence
65+
ci = poisson.ppf([alpha / 2, 1 - alpha / 2], freq)
66+
ci[0] = max(ci[0], 1) # min possible count is 1
67+
ci = int(ci[0]), int(ci[1])
68+
return ci
69+
70+
71+
def double_poisson_ci(freq, confidence=0.99):
72+
""" Assuming two Poisson processes (1 for the event rate and 1 for randomization), calculate the confidence interval
73+
for the true rate
74+
75+
Parameters
76+
----------
77+
freq: float - co-occurrence frequency
78+
confidence: float - desired confidence. range: [0, 1]
79+
80+
Returns
81+
-------
82+
(lower bound, upper bound)
83+
"""
84+
# # Adjust the interval for each individual poisson to achieve overall confidence interval
85+
# confidence_adjusted = 1 - (1 - confidence) ** 0.5
86+
# return (poisson.interval(confidence_adjusted, poisson.interval(confidence_adjusted, freq)[0])[0],
87+
# poisson.interval(confidence_adjusted, poisson.interval(confidence_adjusted, freq)[1])[1])
88+
89+
# More efficient calculation using a single call to poisson.interval with similar results as above
90+
# Adjust the interval for each individual poisson to achieve overall confidence interval
91+
confidence_adjusted = 1 - ((1 - confidence) ** 1.5)
92+
return poisson_ci(freq, confidence_adjusted)
93+
94+
95+
def ln_ratio_ci(freq, ln_ratio, confidence=0.99, replace_inf=None):
96+
""" Estimates the confidence interval of the log ratio using the double poisson method
97+
98+
Parameters
99+
----------
100+
freq: float - co-occurrence count
101+
ln_ratio: float - log ratio
102+
confidence: float - desired confidence. range: [0, 1]
103+
replace_inf: (Optional) If specified, replaces +Inf or -Inf with +replace_inf or -replace_inf (useful because JSON
104+
doesn't allow Infinity)
105+
106+
Returns
107+
-------
108+
(lower bound, upper bound)
109+
"""
110+
# Convert ln_ratio back to ratio and calculate confidence intervals for the ratios
111+
ci = tuple(np.log(np.array(double_poisson_ci(freq, confidence)) * np.exp(ln_ratio) / freq))
112+
if replace_inf:
113+
ci = max(ci[0], -replace_inf), min(ci[1], replace_inf)
114+
return ci
115+
116+
117+
def rel_freq_cis(pair_count, count_1, count_2, confidence=0.99):
118+
""" Estimates the confidence interval of the relative frequency using the double poisson method
119+
120+
Parameters
121+
----------
122+
pair_count: int - co-occurrence count
123+
base_count: int - base concept count
124+
confidence: float - desired confidence. range: [0, 1]
125+
replace_inf: (Optional) If specified, replaces +Inf or -Inf with +replace_inf or -replace_inf (useful because JSON
126+
doesn't allow Infinity)
127+
128+
Returns
129+
-------
130+
(lower bound, upper bound)
131+
"""
132+
pair_count_ci = poisson_ci(pair_count, confidence)
133+
count_1_ci = poisson_ci(count_1, confidence)
134+
count_2_ci = poisson_ci(count_2, confidence)
135+
ci_1 = pair_count_ci[0] / count_1_ci[1], pair_count_ci[1] / count_1_ci[0]
136+
ci_2 = pair_count_ci[0] / count_2_ci[1], pair_count_ci[1] / count_2_ci[0]
137+
return ci_1, ci_2
138+
139+
140+
def log_odds(c1, c2, cp, n, replace_inf=np.inf):
141+
""" Calculates the log-odds and 95% CI
142+
143+
Params
144+
------
145+
c1: count for concept 1
146+
c2: count for concept 2
147+
cp: concept-pair count
148+
n: total population size
149+
replace_inf: (Optional) If specified, replaces +Inf or -Inf with +replace_inf or -replace_inf (useful because JSON
150+
doesn't allow Infinity)
151+
152+
Returns
153+
-------
154+
(log-odds, [95% CI lower bound, 95% CI upper bound])
155+
"""
156+
a = cp
157+
b = c1 - cp
158+
c = c2 - cp
159+
d = n - c1 - c2 + cp
160+
# print('--------------')
161+
# print(c1, c2, cp, n)
162+
# print(a, b, c, d)
163+
# Check b/c <= 0 since Poisson perturbation can cause b or c to be negative
164+
if b <= 0 or c <= 0:
165+
if a == 0:
166+
return 0, [0, 0]
167+
else:
168+
return replace_inf, [replace_inf, replace_inf]
169+
else:
170+
log_odds = np.log((a * d) / (b * c))
171+
ci = 1.96 * np.sqrt(1 / a + 1 / b + 1 / c + 1 / d)
172+
# Strict JSON doesn't allow Inf values, replace as necessary
173+
ci = [log_odds - ci, log_odds + ci]
174+
return log_odds, ci
175+
176+
177+
def chi(c1, c2, cpc, pts):
178+
""" Calculates chi-square p-value
179+
180+
Params
181+
------
182+
c1: count for concept 1
183+
c2: count for concept 2
184+
cpc: concept-pair count
185+
pts: total population size
186+
187+
Returns
188+
-------
189+
chi-square -value (unadjusted)
190+
"""
191+
neg = pts - c1 - c2 + cpc
192+
# Create the observed and expected RxC tables and perform chi-square
193+
o = [neg, c1 - cpc, c2 - cpc, cpc]
194+
e = [(pts - c1) * (pts - c2) / pts, c1 * (pts - c2) / pts, c2 * (pts - c1) / pts, c1 * c2 / pts]
195+
cs = chisquare(o, e, 2)
196+
return cs.pvalue
197+
198+
199+
def calculations(count_1, count_2, pair_count, patient_count):
200+
count_1 = float(count_1)
201+
count_2 = float(count_2)
202+
pair_count = float(pair_count)
203+
patient_count = float(patient_count)
204+
p = chi(count_1, count_2, pair_count, patient_count)
205+
pair_count_ci = poisson_ci(pair_count, confidence=0.99)
206+
ln_ratio = np.log(pair_count * patient_count / (count_1 * count_2))
207+
lr_ci = ln_ratio_ci(pair_count, ln_ratio, confidence=0.99)
208+
# rf1_ci, rf2_ci = rel_freq_cis(pair_count, count_1, count_2, confidence=0.99)
209+
lo, lo_ci = log_odds(count_1, count_2, pair_count, patient_count, )
210+
return CohdCalculations(p, pair_count_ci, ln_ratio, lr_ci, lo, lo_ci)
211+
212+
### Update concept_counts table ###
213+
214+
sql_fetch = '''
215+
SELECT concept_id, concept_count
216+
FROM concept_counts
217+
WHERE dataset_id = %(dataset_id)s
218+
'''
219+
220+
sql_update = '''
221+
UPDATE concept_counts
222+
SET ci_lo = %s, ci_hi = %s
223+
WHERE dataset_id = %s AND concept_id = %s;
224+
'''
225+
cur_update = connection2.cursor(prepared=True)
226+
227+
CONFIDENCE = 0.99
228+
dataset_ids = [1, 2, 3, 4]
229+
for dataset_id in dataset_ids:
230+
t1 = datetime.now()
231+
cur_fetch = conn.exec_driver_sql(sql_fetch, {'dataset_id': dataset_id})
232+
count = 0
233+
params_list = list()
234+
for row in cur_fetch:
235+
concept_id, concept_count = row
236+
ci_lo, ci_hi = poisson_ci(concept_count, CONFIDENCE)
237+
params_list.append((ci_lo, ci_hi, dataset_id, concept_id))
238+
239+
count += 1
240+
if count % 5000 == 0:
241+
print(count)
242+
243+
cur_update.executemany(sql_update, params_list)
244+
connection2.commit()
245+
246+
delta = datetime.now() - t1
247+
print(f'{delta.total_seconds()} seconds')
248+
249+
### Update concept_pair_counts table ###
250+
251+
connection = mysql.connector.connect(host='tr-kp-clinical-db.ncats.io',
252+
database='cohd',
253+
user='admin',
254+
password=pwd)
255+
cursor = connection.cursor()
256+
257+
sql_patients = '''
258+
SELECT count
259+
FROM patient_count
260+
WHERE dataset_id = %s;
261+
'''
262+
263+
sql_counts = '''
264+
SELECT concept_id, concept_count
265+
FROM concept_counts
266+
WHERE dataset_id = %s
267+
ORDER BY concept_id ASC;
268+
'''
269+
270+
# splitting up the query into 2 queries looking for concept_id_1 and concept_id_2 separately is MUCH faster
271+
sql_pair_counts_1 = '''
272+
SELECT cc.concept_id, cc.concept_count, cpc.concept_count AS pair_count
273+
FROM concept_pair_counts cpc
274+
JOIN concept_counts cc ON cpc.dataset_id = cc.dataset_id AND cpc.concept_id_2 = cc.concept_id
275+
WHERE cpc.dataset_id = %s AND concept_id_1 = %s AND cpc.p_value IS NULL
276+
'''
277+
278+
sql_pair_counts_2 = '''
279+
SELECT cc.concept_id, cc.concept_count, cpc.concept_count AS pair_count
280+
FROM concept_pair_counts cpc
281+
JOIN concept_counts cc ON cpc.dataset_id = cc.dataset_id AND cpc.concept_id_1 = cc.concept_id
282+
WHERE cpc.dataset_id = %s AND concept_id_2 = %s AND cpc.p_value IS NULL
283+
'''
284+
285+
sql_update = '''
286+
UPDATE concept_pair_counts
287+
SET p_value = %s, pair_count_ci_lo = %s, pair_count_ci_hi = %s,
288+
ln_ratio = %s, ln_ratio_ci_lo = %s, ln_ratio_ci_hi = %s,
289+
log_odds = %s, log_odds_ci_lo = %s, log_odds_ci_hi = %s
290+
WHERE dataset_id = %s AND concept_id_1 = %s AND concept_id_2 = %s;
291+
'''
292+
cur_update = connection.cursor(prepared=True)
293+
294+
n_rows_dataset = [15927195, 32788901, 197683321, 56848043]
295+
datasets = [2, 4]
296+
for dataset_id in datasets:
297+
n_rows = n_rows_dataset[dataset_id - 1]
298+
logging.info(f'######## dataset_id {dataset_id} - {n_rows} rows ########')
299+
300+
# Get patient count
301+
cursor.execute(sql_patients, (dataset_id,))
302+
patient_count = float(cursor.fetchall()[0][0])
303+
304+
cursor.execute(sql_counts, (dataset_id,))
305+
n_a = 0
306+
n_b = 0
307+
t1 = datetime.now()
308+
for row_counts in cursor.fetchall():
309+
concept_id_a, count_a = row_counts
310+
logging.debug(f'concept_id_a: {concept_id_a}')
311+
312+
cursor.execute(sql_pair_counts_1, (dataset_id, concept_id_a,))
313+
sql_params = list()
314+
for row_pair_counts in cursor.fetchall():
315+
concept_id_b, count_b, pair_count = row_pair_counts
316+
cc = calculations(count_a, count_b, pair_count, patient_count)
317+
sql_params.append((cc.p_value, cc.pair_count_ci[0], cc.pair_count_ci[1],
318+
cc.ln_ratio, cc.ln_ratio_ci[0], cc.ln_ratio_ci[1],
319+
cc.log_odds, cc.log_odds_ci[0], cc.log_odds_ci[1],
320+
dataset_id, concept_id_a, concept_id_b))
321+
n_b += 1
322+
logging.debug(f'concept_id_b: {concept_id_b}')
323+
logging.debug(cc)
324+
cur_update.executemany(sql_update, sql_params)
325+
326+
cursor.execute(sql_pair_counts_2, (dataset_id, concept_id_a,))
327+
sql_params = list()
328+
for row_pair_counts in cursor.fetchall():
329+
concept_id_b, count_b, pair_count = row_pair_counts
330+
cc = calculations(count_b, count_a, pair_count, patient_count)
331+
sql_params.append((cc.p_value, cc.pair_count_ci[0], cc.pair_count_ci[1],
332+
cc.ln_ratio, cc.ln_ratio_ci[0], cc.ln_ratio_ci[1],
333+
cc.log_odds, cc.log_odds_ci[0], cc.log_odds_ci[1],
334+
dataset_id, concept_id_b, concept_id_a))
335+
n_b += 1
336+
logging.debug(f'concept_id_b: {concept_id_b}')
337+
logging.debug(cc)
338+
cur_update.executemany(sql_update, sql_params)
339+
340+
connection.commit()
341+
342+
n_a += 1
343+
if n_a % 100 == 1 and n_b > 0:
344+
duration = (datetime.now() - t1).total_seconds() / 60 / 60
345+
est_speed = n_b / duration
346+
est_remain = (n_rows - n_b) / est_speed
347+
logging.info(
348+
f'{n_b} ({n_b / n_rows * 100}%) - {duration:0.1f} hours - {est_remain:0.1f} hours ({est_remain / 24:0.2f} days) remaining')
349+
350+
logging.info(f'{n_b} - {(datetime.now() - t1).total_seconds() / 60 / 60} hours')

0 commit comments

Comments
 (0)