Skip to content

Commit 016a9e7

Browse files
authored
Merge pull request #175 from WengLab-InformaticsResearch/precompute_stats
Precompute stats
2 parents c0753f5 + a44e83b commit 016a9e7

1 file changed

Lines changed: 281 additions & 4 deletions

File tree

cohd/query_cohd_mysql.py

Lines changed: 281 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from .omop_xref import xref_to_omop_standard_concept, omop_map_to_standard, omop_map_from_standard, \
88
xref_from_omop_standard_concept, xref_from_omop_local, xref_to_omop_local
9-
from .cohd_utilities import ln_ratio_ci, rel_freq_ci, log_odds
9+
from .cohd_utilities import ln_ratio_ci, rel_freq_ci, log_odds, clip
1010
from .app import cache
1111

1212
# Configuration
@@ -1206,7 +1206,7 @@ def query_association(method, concept_id_1, concept_id_2=None, dataset_id=None,
12061206

12071207

12081208
@cache.memoize(timeout=86400)
1209-
def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None, concept_class_id=None,
1209+
def query_trapi_old(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None, concept_class_id=None,
12101210
ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE):
12111211
""" Query for TRAPI. Performs the calculations for all association methods
12121212
@@ -1425,6 +1425,282 @@ def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None
14251425
return json_return
14261426

14271427

1428+
def get_total_pair_counts(dataset_id: int) -> int:
1429+
""" Returns total number of pairs of concepts in the given dataset
1430+
1431+
Params
1432+
------
1433+
dataset_id: (int) dataset id
1434+
1435+
Returns
1436+
-------
1437+
(int) total number of pairs of concepts
1438+
"""
1439+
if not get_total_pair_counts.total_pair_counts:
1440+
# Get the total pair counts and store locally for faster future retrieval
1441+
get_total_pair_counts.total_pair_counts = dict()
1442+
conn = sql_connection()
1443+
cur = conn.cursor()
1444+
sql = '''SELECT dataset_id, SUM(count) AS pair_count
1445+
FROM domain_pair_concept_counts
1446+
GROUP BY dataset_id;'''
1447+
cur.execute(sql)
1448+
results = cur.fetchall()
1449+
for result in results:
1450+
get_total_pair_counts.total_pair_counts[result['dataset_id']] = int(result['pair_count'])
1451+
1452+
return get_total_pair_counts.total_pair_counts[dataset_id]
1453+
get_total_pair_counts.total_pair_counts = None
1454+
1455+
1456+
@cache.memoize(timeout=86400)
1457+
def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None, concept_class_id=None,
1458+
ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE):
1459+
""" Query for TRAPI. Performs the calculations for all association methods
1460+
1461+
Parameters
1462+
----------
1463+
concept_id_1: String - OMOP concept ID
1464+
concept_id_2: (optional) String - OMOP concept ID
1465+
dataset_id: (optional) String - COHD dataset ID
1466+
domain_id: (optional) String - OMOP domain ID
1467+
concept_class_id: (optional) String - OMOP concept class ID
1468+
ln_ratio_sign: (optional) Int - 1: positive ln_ratio only; -1: negative ln_ratio only; 0: any ln_ratio
1469+
confidence: (optional) Float - Confidence level
1470+
1471+
Returns
1472+
-------
1473+
Dict results
1474+
"""
1475+
assert concept_id_1 is not None and str(concept_id_1), \
1476+
'query_cohd_mysql.py::query_trapi() - Bad input. concept_id_1={concept_id_1}'.format(
1477+
concept_id_1=str(concept_id_1)
1478+
)
1479+
1480+
# Connect to MYSQL database
1481+
conn = sql_connection()
1482+
cur = conn.cursor()
1483+
1484+
# Get the total number of pairs for Bonferonni adjustment
1485+
pair_count = get_total_pair_counts(dataset_id)
1486+
1487+
# Filter ln ratio
1488+
if ln_ratio_sign == 0:
1489+
ln_ratio_filter = ''
1490+
elif ln_ratio_sign > 0:
1491+
ln_ratio_filter = 'AND log(cp.concept_count * pc.count / (c1.concept_count * c2.concept_count + 0E0)) > 0'
1492+
elif ln_ratio_sign < 0:
1493+
ln_ratio_filter = 'AND log(cp.concept_count * pc.count / (c1.concept_count * c2.concept_count + 0E0)) < 0'
1494+
1495+
if concept_id_2 is not None:
1496+
# concept_id_2 is specified, only return the results for the pair (concept_id_1, concept_id_2)
1497+
concept_id_2 = int(concept_id_2)
1498+
order = concept_id_1 < concept_id_2
1499+
sql = '''SELECT
1500+
cp.dataset_id,
1501+
cp.concept_id_1 AS {rename_id_1},
1502+
cp.concept_id_2 AS {rename_id_2},
1503+
c1.concept_count AS {rename_count_1},
1504+
c2.concept_count AS {rename_count_2},
1505+
cp.concept_count AS concept_pair_count,
1506+
c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1507+
p_value,
1508+
ln_ratio,
1509+
ln_ratio_ci_lo,
1510+
ln_ratio_ci_hi,
1511+
cp.concept_count / (c1.concept_count + 0E0) AS {rename_rf1},
1512+
cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS {rename_rf1_ci_lo},
1513+
cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS {rename_rf1_ci_hi},
1514+
cp.concept_count / (c2.concept_count + 0E0) AS {rename_rf2},
1515+
cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS {rename_rf2_ci_lo},
1516+
cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS {rename_rf2_ci_hi},
1517+
log_odds,
1518+
log_odds_ci_lo,
1519+
log_odds_ci_hi,
1520+
pc.count AS patient_count
1521+
FROM cohd.concept_pair_counts cp
1522+
JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1523+
JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1524+
JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1525+
WHERE cp.dataset_id = %(dataset_id)s
1526+
AND c1.dataset_id = %(dataset_id)s
1527+
AND c2.dataset_id = %(dataset_id)s
1528+
AND cp.concept_id_1 = %(concept_id_1)s
1529+
AND cp.concept_id_2 = %(concept_id_2)s
1530+
{ln_ratio_filter}
1531+
;'''
1532+
params = {
1533+
'dataset_id': dataset_id,
1534+
'concept_id_1': concept_id_1 if order else concept_id_2,
1535+
'concept_id_2': concept_id_2 if order else concept_id_1
1536+
}
1537+
rename_id_1 = 'concept_id_1'
1538+
rename_id_2 = 'concept_id_2'
1539+
rename_count_1 = 'concept_1_count'
1540+
rename_count_2 = 'concept_2_count'
1541+
rename_rf1 = 'relative_frequency_1'
1542+
rename_rf2 = 'relative_frequency_2'
1543+
rename_rf1_ci_lo = 'rf1_ci_lo'
1544+
rename_rf1_ci_hi = 'rf1_ci_hi'
1545+
rename_rf2_ci_lo = 'rf2_ci_lo'
1546+
rename_rf2_ci_hi = 'rf2_ci_hi'
1547+
1548+
if not order:
1549+
rename_id_1 = 'concept_id_2'
1550+
rename_id_2 = 'concept_id_1'
1551+
rename_count_1 = 'concept_2_count'
1552+
rename_count_2 = 'concept_1_count'
1553+
rename_rf1 = 'relative_frequency_2'
1554+
rename_rf2 = 'relative_frequency_1'
1555+
rename_rf1_ci_lo = 'rf2_ci_lo'
1556+
rename_rf1_ci_hi = 'rf2_ci_hi'
1557+
rename_rf2_ci_lo = 'rf1_ci_lo'
1558+
rename_rf2_ci_hi = 'rf1_ci_hi'
1559+
sql = sql.format(rename_id_1=rename_id_1, rename_id_2=rename_id_2, rename_count_1=rename_count_1,
1560+
rename_count_2=rename_count_2, rename_rf1=rename_rf1, rename_rf2=rename_rf2,
1561+
ln_ratio_filter=ln_ratio_filter,
1562+
rename_rf1_ci_lo=rename_rf1_ci_lo, rename_rf1_ci_hi=rename_rf1_ci_hi,
1563+
rename_rf2_ci_lo=rename_rf2_ci_lo, rename_rf2_ci_hi=rename_rf2_ci_hi)
1564+
1565+
else:
1566+
# If concept_id_2 is not specified, get results for all pairs that include concept_id_1
1567+
sql = '''SELECT *
1568+
FROM
1569+
((SELECT
1570+
cp.dataset_id,
1571+
cp.concept_id_1,
1572+
cp.concept_id_2,
1573+
c1.concept_count AS concept_1_count,
1574+
c2.concept_count AS concept_2_count,
1575+
cp.concept_count AS concept_pair_count,
1576+
c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1577+
p_value,
1578+
ln_ratio,
1579+
ln_ratio_ci_lo,
1580+
ln_ratio_ci_hi,
1581+
cp.concept_count / (c1.concept_count + 0E0) AS relative_frequency_1,
1582+
cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS rf1_ci_lo,
1583+
cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS rf1_ci_hi,
1584+
cp.concept_count / (c2.concept_count + 0E0) AS relative_frequency_2,
1585+
cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS rf2_ci_lo,
1586+
cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS rf2_ci_hi,
1587+
log_odds,
1588+
log_odds_ci_lo,
1589+
log_odds_ci_hi,
1590+
c.concept_name AS concept_2_name,
1591+
c.domain_id AS concept_2_domain,
1592+
c.concept_class_id AS concept_2_class_id,
1593+
pc.count AS patient_count
1594+
FROM cohd.concept_pair_counts cp
1595+
JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1596+
JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1597+
JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1598+
JOIN cohd.concept c ON cp.concept_id_2 = c.concept_id
1599+
WHERE cp.dataset_id = %(dataset_id)s
1600+
AND c1.dataset_id = %(dataset_id)s
1601+
AND c2.dataset_id = %(dataset_id)s
1602+
AND cp.concept_id_1 = %(concept_id_1)s
1603+
{domain_filter}
1604+
{concept_class_filter}
1605+
{ln_ratio_filter})
1606+
UNION
1607+
(SELECT
1608+
cp.dataset_id,
1609+
cp.concept_id_2 AS concept_id_1,
1610+
cp.concept_id_1 AS concept_id_2,
1611+
c2.concept_count AS concept_1_count,
1612+
c1.concept_count AS concept_2_count,
1613+
cp.concept_count AS concept_pair_count,
1614+
c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1615+
p_value,
1616+
ln_ratio,
1617+
ln_ratio_ci_lo,
1618+
ln_ratio_ci_hi,
1619+
cp.concept_count / (c2.concept_count + 0E0) AS relative_frequency_1,
1620+
cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS rf1_ci_lo,
1621+
cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS rf1_ci_hi,
1622+
cp.concept_count / (c1.concept_count + 0E0) AS relative_frequency_2,
1623+
cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS rf2_ci_lo,
1624+
cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS rf2_ci_hi,
1625+
log_odds,
1626+
log_odds_ci_lo,
1627+
log_odds_ci_hi,
1628+
c.concept_name AS concept_2_name,
1629+
c.domain_id AS concept_2_domain,
1630+
c.concept_class_id AS concept_2_class_id,
1631+
pc.count AS patient_count
1632+
FROM cohd.concept_pair_counts cp
1633+
JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1634+
JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1635+
JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1636+
JOIN cohd.concept c ON cp.concept_id_1 = c.concept_id
1637+
WHERE cp.dataset_id = %(dataset_id)s
1638+
AND c1.dataset_id = %(dataset_id)s
1639+
AND c2.dataset_id = %(dataset_id)s
1640+
AND cp.concept_id_2 = %(concept_id_1)s
1641+
{domain_filter}
1642+
{concept_class_filter}
1643+
{ln_ratio_filter})) x
1644+
ORDER BY ABS(ln_ratio) DESC;'''
1645+
params = {
1646+
'dataset_id': dataset_id,
1647+
'concept_id_1': concept_id_1,
1648+
}
1649+
1650+
if domain_id is not None and not domain_id == ['']:
1651+
# Restrict the associated concept by domain
1652+
domain_filter = 'AND c.domain_id = %(domain_id)s'
1653+
params['domain_id'] = domain_id
1654+
else:
1655+
# Unrestricted domain
1656+
domain_filter = ''
1657+
1658+
# Filter concepts by concept_class
1659+
if concept_class_id is None or not concept_class_id or concept_class_id == [''] or \
1660+
concept_class_id.isspace():
1661+
concept_class_filter = ''
1662+
else:
1663+
concept_class_filter = 'AND concept_class_id = %(concept_class_id)s'
1664+
params['concept_class_id'] = concept_class_id
1665+
1666+
sql = sql.format(domain_filter=domain_filter, concept_class_filter=concept_class_filter,
1667+
ln_ratio_filter=ln_ratio_filter)
1668+
1669+
cur.execute(sql, params)
1670+
json_return = cur.fetchall()
1671+
1672+
# Perform calculations for results
1673+
for row in json_return:
1674+
cpc = row['concept_pair_count']
1675+
c1 = row['concept_1_count']
1676+
c2 = row['concept_2_count']
1677+
1678+
# Confidence interval for obsExpRatio
1679+
# The CI bounds may hit Inf, which causes issues with JSON serialization. Limit it to 999
1680+
row['ln_ratio_ci'] = (clip(row['ln_ratio_ci_lo'], JSON_INFINITY_REPLACEMENT),
1681+
clip(row['ln_ratio_ci_hi'], JSON_INFINITY_REPLACEMENT))
1682+
1683+
# Confidence intervals for relative frequencies
1684+
row['relative_frequency_1_ci'] = row['rf1_ci_lo'], min(row['rf1_ci_hi'], JSON_INFINITY_REPLACEMENT)
1685+
row['relative_frequency_2_ci'] = row['rf2_ci_lo'], min(row['rf2_ci_hi'], JSON_INFINITY_REPLACEMENT)
1686+
1687+
# Chi-square
1688+
p_value = row['p_value']
1689+
row['chi_square_p-value'] = max(p_value, MIN_P)
1690+
row['chi_square_p-value_adjusted'] = max(min(p_value * pair_count, 1.0), MIN_P) # Bonferonni adjustment
1691+
1692+
# Log-odds
1693+
row['log_odds'] = clip(row['log_odds'], JSON_INFINITY_REPLACEMENT)
1694+
row['log_odds_ci'] = [clip(row['log_odds_ci_lo'], JSON_INFINITY_REPLACEMENT),
1695+
clip(row['log_odds_ci_hi'], JSON_INFINITY_REPLACEMENT)]
1696+
1697+
cur.close()
1698+
conn.close()
1699+
1700+
json_return = {"results": json_return}
1701+
return json_return
1702+
1703+
14281704
def health():
14291705
""" Quick health check of MySQL database
14301706
Simply queries the dataset table
@@ -1437,8 +1713,9 @@ def health():
14371713
cur = conn.cursor()
14381714

14391715
try:
1440-
sql = '''SELECT *
1441-
FROM cohd.dataset;'''
1716+
sql = '''SELECT dataset_id
1717+
FROM cohd.dataset
1718+
LIMIT 1;'''
14421719
cur.execute(sql)
14431720
datasets = cur.fetchall()
14441721
healthy = len(datasets) > 0

0 commit comments

Comments
 (0)