1+ # This code pasted together from prototyping notebook. Untested.
import logging
import logging.handlers  # RotatingFileHandler lives in this submodule; not pulled in by `import logging`
import sys
from collections import namedtuple
from datetime import datetime
from getpass import getpass

import mysql.connector
import numpy as np
import pandas as pd
from scipy.stats import poisson, chisquare
from sqlalchemy import create_engine
13+
14+
15+ logFormatter = logging .Formatter ("%(asctime)s %(levelname)s %(module)s:%(lineno)d: %(message)s" )
16+ rootLogger = logging .getLogger ()
17+ rootLogger .handlers .clear ()
18+ fileHandler = logging .handlers .RotatingFileHandler (f"prototype.log" , maxBytes = 10e6 )
19+ fileHandler .setFormatter (logFormatter )
20+ fileHandler .setLevel (logging .INFO )
21+ rootLogger .addHandler (fileHandler )
22+ consoleHandler = logging .StreamHandler (sys .stdout )
23+ consoleHandler .setFormatter (logFormatter )
24+ consoleHandler .setLevel (logging .INFO )
25+ rootLogger .addHandler (consoleHandler )
26+ rootLogger .setLevel (logging .INFO )
27+
# Prompt for the DB password interactively rather than hard-coding it.
pwd = getpass()
# SQLAlchemy engine/connection, used for the read-side queries below.
# NOTE(review): password is interpolated straight into the URL; special
# characters in it would need URL-escaping -- confirm before reuse.
engine = create_engine(f'mysql+mysqlconnector://admin:{pwd}@tr-kp-clinical-db.ncats.io/cohd')
conn = engine.connect()

# Separate mysql-connector connection, used for the prepared UPDATE statements below.
connection2 = mysql.connector.connect(host='tr-kp-clinical-db.ncats.io',
                                      database='cohd',
                                      user='admin',
                                      password=pwd)
36+
# Sanity check: print the number of concept_counts rows in each dataset (1-4).
sql = '''
SELECT COUNT(*)
FROM concept_counts
WHERE dataset_id = %(dataset_id)s;
'''
for dataset_id in range(1, 5):
    # exec_driver_sql passes the dict through to the driver's pyformat-style params
    cur = conn.exec_driver_sql(sql, {'dataset_id': dataset_id})
    n_rows = cur.fetchone()[0]
    print(f'{dataset_id}: {n_rows} rows')
46+
# Bundle of per-concept-pair association statistics produced by calculations()
CohdCalculations = namedtuple(
    'CohdCalculations',
    'p_value pair_count_ci ln_ratio ln_ratio_ci log_odds log_odds_ci',
)
49+
50+
def poisson_ci(freq, confidence=0.99):
    """ Confidence interval for the true rate of a single Poisson process
    with observed rate `freq`.

    Parameters
    ----------
    freq: float - observed frequency
    confidence: float - desired confidence. range: [0, 1]

    Returns
    -------
    (lower bound, upper bound) as ints
    """
    tail = (1 - confidence) / 2
    lower, upper = poisson.ppf([tail, 1 - tail], freq)
    # The minimum possible count is 1, so floor the lower bound there
    return max(int(lower), 1), int(upper)
69+
70+
def double_poisson_ci(freq, confidence=0.99):
    """ Confidence interval for the true rate when the observed count is modeled
    as two stacked Poisson processes (1 for the event rate and 1 for
    randomization).

    An exact treatment would nest two poisson.interval calls; instead, the
    confidence is widened (exponent 1.5 on alpha, which gives similar results)
    so that a single poisson_ci call suffices.

    Parameters
    ----------
    freq: float - co-occurrence frequency
    confidence: float - desired confidence. range: [0, 1]

    Returns
    -------
    (lower bound, upper bound)
    """
    alpha = 1 - confidence
    return poisson_ci(freq, 1 - alpha ** 1.5)
93+
94+
def ln_ratio_ci(freq, ln_ratio, confidence=0.99, replace_inf=None):
    """ Estimates the confidence interval of the log ratio using the double poisson method

    Parameters
    ----------
    freq: float - co-occurrence count
    ln_ratio: float - log ratio
    confidence: float - desired confidence. range: [0, 1]
    replace_inf: (Optional) If specified, replaces +Inf or -Inf with +replace_inf or -replace_inf (useful because JSON
    doesn't allow Infinity)

    Returns
    -------
    (lower bound, upper bound)
    """
    # Convert ln_ratio back to a plain ratio, scale the count CI by it, and take logs again
    ci = tuple(np.log(np.array(double_poisson_ci(freq, confidence)) * np.exp(ln_ratio) / freq))
    # Explicit None check: a truthiness test (`if replace_inf:`) would silently
    # skip the clamp when replace_inf == 0
    if replace_inf is not None:
        ci = max(ci[0], -replace_inf), min(ci[1], replace_inf)
    return ci
115+
116+
def rel_freq_cis(pair_count, count_1, count_2, confidence=0.99):
    """ Estimates the confidence intervals of the two relative frequencies
    (pair_count / count_1 and pair_count / count_2) from per-count Poisson CIs

    Parameters
    ----------
    pair_count: int - co-occurrence count
    count_1: int - concept 1 count
    count_2: int - concept 2 count
    confidence: float - desired confidence. range: [0, 1]

    Returns
    -------
    ((rf1 lower bound, rf1 upper bound), (rf2 lower bound, rf2 upper bound))
    """
    pair_count_ci = poisson_ci(pair_count, confidence)
    count_1_ci = poisson_ci(count_1, confidence)
    count_2_ci = poisson_ci(count_2, confidence)
    # Widest plausible range: smallest numerator over largest denominator, and vice versa
    ci_1 = pair_count_ci[0] / count_1_ci[1], pair_count_ci[1] / count_1_ci[0]
    ci_2 = pair_count_ci[0] / count_2_ci[1], pair_count_ci[1] / count_2_ci[0]
    return ci_1, ci_2
138+
139+
def log_odds(c1, c2, cp, n, replace_inf=np.inf):
    """ Calculates the log-odds and 95% CI from the implied 2x2 contingency table

    Params
    ------
    c1: count for concept 1
    c2: count for concept 2
    cp: concept-pair count
    n: total population size
    replace_inf: (Optional) If specified, replaces +Inf or -Inf with +replace_inf or -replace_inf (useful because JSON
    doesn't allow Infinity)

    Returns
    -------
    (log-odds, [95% CI lower bound, 95% CI upper bound])
    """
    # 2x2 contingency table cells
    a = cp                 # both concepts present
    b = c1 - cp            # concept 1 only
    c = c2 - cp            # concept 2 only
    d = n - c1 - c2 + cp   # neither concept
    # Check b/c <= 0 since Poisson perturbation can cause b or c to be negative
    if b <= 0 or c <= 0:
        if a == 0:
            return 0, [0, 0]
        return replace_inf, [replace_inf, replace_inf]
    if a == 0:
        # Odds ratio is 0 => log-odds is -Inf. Return the capped value directly
        # instead of letting np.log(0) emit -Inf plus a NaN CI bound (and warnings).
        return -replace_inf, [-replace_inf, -replace_inf]
    odds = np.log((a * d) / (b * c))
    half_width = 1.96 * np.sqrt(1 / a + 1 / b + 1 / c + 1 / d)
    # Strict JSON doesn't allow Inf values, so clamp results to +/-replace_inf
    ci = [max(odds - half_width, -replace_inf), min(odds + half_width, replace_inf)]
    return max(min(odds, replace_inf), -replace_inf), ci
175+
176+
def chi(c1, c2, cpc, pts):
    """ Calculates the chi-square p-value for a concept pair

    Params
    ------
    c1: count for concept 1
    c2: count for concept 2
    cpc: concept-pair count
    pts: total population size

    Returns
    -------
    chi-square p-value (unadjusted)
    """
    both_absent = pts - c1 - c2 + cpc
    # Flattened 2x2 observed / expected tables: [neither, c1 only, c2 only, both]
    observed = [both_absent, c1 - cpc, c2 - cpc, cpc]
    expected = [
        (pts - c1) * (pts - c2) / pts,
        c1 * (pts - c2) / pts,
        c2 * (pts - c1) / pts,
        c1 * c2 / pts,
    ]
    # ddof=2 leaves 4 - 1 - 2 = 1 degree of freedom, as appropriate for a 2x2 table
    return chisquare(observed, expected, ddof=2).pvalue
197+
198+
def calculations(count_1, count_2, pair_count, patient_count):
    """ Computes the full set of association statistics for one concept pair.

    Returns a CohdCalculations namedtuple holding the chi-square p-value, the
    pair-count CI, the observed/expected log ratio with its CI, and the
    log-odds with its 95% CI (all at 99% confidence where applicable).
    """
    count_1, count_2, pair_count, patient_count = (
        float(count_1), float(count_2), float(pair_count), float(patient_count))
    p_value = chi(count_1, count_2, pair_count, patient_count)
    pair_ci = poisson_ci(pair_count, confidence=0.99)
    ratio = np.log(pair_count * patient_count / (count_1 * count_2))
    ratio_ci = ln_ratio_ci(pair_count, ratio, confidence=0.99)
    odds, odds_ci = log_odds(count_1, count_2, pair_count, patient_count)
    return CohdCalculations(p_value, pair_ci, ratio, ratio_ci, odds, odds_ci)
211+
### Update concept_counts table ###
# For every concept in each dataset, compute the Poisson CI of its count and
# write it back to concept_counts (ci_lo / ci_hi).

sql_fetch = '''
SELECT concept_id, concept_count
FROM concept_counts
WHERE dataset_id = %(dataset_id)s
'''

sql_update = '''
UPDATE concept_counts
SET ci_lo = %s, ci_hi = %s
WHERE dataset_id = %s AND concept_id = %s;
'''
# Prepared-statement cursor on the mysql-connector connection for the batched UPDATEs
cur_update = connection2.cursor(prepared=True)

CONFIDENCE = 0.99
dataset_ids = [1, 2, 3, 4]
for dataset_id in dataset_ids:
    t1 = datetime.now()
    cur_fetch = conn.exec_driver_sql(sql_fetch, {'dataset_id': dataset_id})
    count = 0
    params_list = list()
    for row in cur_fetch:
        concept_id, concept_count = row
        ci_lo, ci_hi = poisson_ci(concept_count, CONFIDENCE)
        params_list.append((ci_lo, ci_hi, dataset_id, concept_id))

        count += 1
        if count % 5000 == 0:  # progress indicator
            print(count)

    # One batched UPDATE + commit per dataset
    cur_update.executemany(sql_update, params_list)
    connection2.commit()

    delta = datetime.now() - t1
    print(f'{delta.total_seconds()} seconds')
248+
### Update concept_pair_counts table ###
# For every concept pair in the selected datasets, compute the association
# statistics (chi-square p, pair-count CI, ln-ratio + CI, log-odds + CI) and
# write them back to concept_pair_counts. Rows already processed
# (p_value IS NOT NULL) are skipped by the fetch queries, so the run is resumable.

# Fresh mysql-connector connection for this phase
connection = mysql.connector.connect(host='tr-kp-clinical-db.ncats.io',
                                     database='cohd',
                                     user='admin',
                                     password=pwd)
cursor = connection.cursor()

sql_patients = '''
SELECT count
FROM patient_count
WHERE dataset_id = %s;
'''

sql_counts = '''
SELECT concept_id, concept_count
FROM concept_counts
WHERE dataset_id = %s
ORDER BY concept_id ASC;
'''

# splitting up the query into 2 queries looking for concept_id_1 and concept_id_2 separately is MUCH faster
sql_pair_counts_1 = '''
SELECT cc.concept_id, cc.concept_count, cpc.concept_count AS pair_count
FROM concept_pair_counts cpc
JOIN concept_counts cc ON cpc.dataset_id = cc.dataset_id AND cpc.concept_id_2 = cc.concept_id
WHERE cpc.dataset_id = %s AND concept_id_1 = %s AND cpc.p_value IS NULL
'''

sql_pair_counts_2 = '''
SELECT cc.concept_id, cc.concept_count, cpc.concept_count AS pair_count
FROM concept_pair_counts cpc
JOIN concept_counts cc ON cpc.dataset_id = cc.dataset_id AND cpc.concept_id_1 = cc.concept_id
WHERE cpc.dataset_id = %s AND concept_id_2 = %s AND cpc.p_value IS NULL
'''

sql_update = '''
UPDATE concept_pair_counts
SET p_value = %s, pair_count_ci_lo = %s, pair_count_ci_hi = %s,
ln_ratio = %s, ln_ratio_ci_lo = %s, ln_ratio_ci_hi = %s,
log_odds = %s, log_odds_ci_lo = %s, log_odds_ci_hi = %s
WHERE dataset_id = %s AND concept_id_1 = %s AND concept_id_2 = %s;
'''
cur_update = connection.cursor(prepared=True)

# Hard-coded row counts per dataset, used only for progress / ETA logging
n_rows_dataset = [15927195, 32788901, 197683321, 56848043]
datasets = [2, 4]  # datasets to process in this run
for dataset_id in datasets:
    n_rows = n_rows_dataset[dataset_id - 1]
    logging.info(f'######## dataset_id {dataset_id} - {n_rows} rows ########')

    # Get patient count
    cursor.execute(sql_patients, (dataset_id,))
    patient_count = float(cursor.fetchall()[0][0])

    cursor.execute(sql_counts, (dataset_id,))
    n_a = 0  # concepts processed
    n_b = 0  # pairs processed
    t1 = datetime.now()
    for row_counts in cursor.fetchall():
        concept_id_a, count_a = row_counts
        logging.debug(f'concept_id_a: {concept_id_a}')

        # Pairs where concept_id_a is concept_id_1
        cursor.execute(sql_pair_counts_1, (dataset_id, concept_id_a,))
        sql_params = list()
        for row_pair_counts in cursor.fetchall():
            concept_id_b, count_b, pair_count = row_pair_counts
            cc = calculations(count_a, count_b, pair_count, patient_count)
            sql_params.append((cc.p_value, cc.pair_count_ci[0], cc.pair_count_ci[1],
                               cc.ln_ratio, cc.ln_ratio_ci[0], cc.ln_ratio_ci[1],
                               cc.log_odds, cc.log_odds_ci[0], cc.log_odds_ci[1],
                               dataset_id, concept_id_a, concept_id_b))
            n_b += 1
            logging.debug(f'concept_id_b: {concept_id_b}')
            logging.debug(cc)
        cur_update.executemany(sql_update, sql_params)

        # Pairs where concept_id_a is concept_id_2 (note the swapped argument
        # order into calculations and the swapped key order in the params)
        cursor.execute(sql_pair_counts_2, (dataset_id, concept_id_a,))
        sql_params = list()
        for row_pair_counts in cursor.fetchall():
            concept_id_b, count_b, pair_count = row_pair_counts
            cc = calculations(count_b, count_a, pair_count, patient_count)
            sql_params.append((cc.p_value, cc.pair_count_ci[0], cc.pair_count_ci[1],
                               cc.ln_ratio, cc.ln_ratio_ci[0], cc.ln_ratio_ci[1],
                               cc.log_odds, cc.log_odds_ci[0], cc.log_odds_ci[1],
                               dataset_id, concept_id_b, concept_id_a))
            n_b += 1
            logging.debug(f'concept_id_b: {concept_id_b}')
            logging.debug(cc)
        cur_update.executemany(sql_update, sql_params)

        # Commit once per concept_id_a
        connection.commit()

        n_a += 1
        # Periodic progress + ETA log (every 100 concepts)
        if n_a % 100 == 1 and n_b > 0:
            duration = (datetime.now() - t1).total_seconds() / 60 / 60
            est_speed = n_b / duration
            est_remain = (n_rows - n_b) / est_speed
            logging.info(
                f'{n_b} ({n_b / n_rows * 100}%) - {duration:0.1f} hours - {est_remain:0.1f} hours ({est_remain / 24:0.2f} days) remaining')

    logging.info(f'{n_b} - {(datetime.now() - t1).total_seconds() / 60 / 60} hours')