Skip to content

Commit 892d123

Browse files
committed
Merge remote-tracking branch 'origin/master' into node_norm_update
2 parents f6c53f3 + 5c7c20f commit 892d123

10 files changed

Lines changed: 664 additions & 94 deletions

File tree

.github/workflows/cohd-monitor.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ jobs:
2424
id: dependencies
2525
run: |
2626
python -m pip install --upgrade pip
27-
pip install pytest requests pandas matplotlib linkml-runtime bmt reasoner-validator flask flask-cors flask-caching
27+
pip install pytest requests pandas matplotlib linkml-runtime bmt reasoner-validator>=3.8.0 flask flask-cors flask-caching
2828
pip show reasoner-validator
2929
3030
# Run /health checks on all COHD instances (dev, CI, Test, Prod)

cohd/TRAPI14.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ info:
4141
url: https://github.com/NCATSTranslator/translator_extensions/blob/\
4242
production/x-translator/
4343
x-trapi:
44-
version: 1.4.2
44+
version: 1.4.0
4545
asyncquery: false
4646
operations:
4747
- lookup

cohd/biolink_mapper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ def prefetch_mappings():
150150
mapping_rows = cur.fetchall()
151151
BiolinkConceptMapper._map_omop = {r['omop_id']:r for r in mapping_rows}
152152
BiolinkConceptMapper._map_biolink = {r['biolink_id']:r for r in mapping_rows if r['preferred']}
153+
logging.info('Biolink mappings prefetch completed')
153154

154155
@staticmethod
155156
def map_to_omop(curies: List[str]) -> Optional[Tuple[Dict[str, Optional[OmopBiolinkMapping]], Dict[str, Optional[NormalizedNode]]]]:

cohd/cohd_trapi_14.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class CohdTrapi140(CohdTrapi):
3737
edge_types_negative = ['biolink:negatively_correlated_with']
3838
default_negative_predicate = edge_types_negative[0]
3939

40-
tool_version = f'{CohdTrapi._SERVICE_NAME} 6.3.6'
40+
tool_version = f'{CohdTrapi._SERVICE_NAME} 6.3.8'
4141
schema_version = '1.4.2'
4242
biolink_version = bm_version
4343

cohd/query_cohd_mysql.py

Lines changed: 281 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from .omop_xref import xref_to_omop_standard_concept, omop_map_to_standard, omop_map_from_standard, \
88
xref_from_omop_standard_concept, xref_from_omop_local, xref_to_omop_local
9-
from .cohd_utilities import ln_ratio_ci, rel_freq_ci, log_odds
9+
from .cohd_utilities import ln_ratio_ci, rel_freq_ci, log_odds, clip
1010
from .app import cache
1111

1212
# Configuration
@@ -1206,7 +1206,7 @@ def query_association(method, concept_id_1, concept_id_2=None, dataset_id=None,
12061206

12071207

12081208
@cache.memoize(timeout=86400)
1209-
def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None, concept_class_id=None,
1209+
def query_trapi_old(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None, concept_class_id=None,
12101210
ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE):
12111211
""" Query for TRAPI. Performs the calculations for all association methods
12121212
@@ -1425,6 +1425,282 @@ def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None
14251425
return json_return
14261426

14271427

1428+
def get_total_pair_counts(dataset_id: int) -> int:
1429+
""" Returns total number of pairs of concepts in the given dataset
1430+
1431+
Params
1432+
------
1433+
dataset_id: (int) dataset id
1434+
1435+
Returns
1436+
-------
1437+
(int) total number of pairs of concepts
1438+
"""
1439+
if not get_total_pair_counts.total_pair_counts:
1440+
# Get the total pair counts and store locally for faster future retrieval
1441+
get_total_pair_counts.total_pair_counts = dict()
1442+
conn = sql_connection()
1443+
cur = conn.cursor()
1444+
sql = '''SELECT dataset_id, SUM(count) AS pair_count
1445+
FROM domain_pair_concept_counts
1446+
GROUP BY dataset_id;'''
1447+
cur.execute(sql)
1448+
results = cur.fetchall()
1449+
for result in results:
1450+
get_total_pair_counts.total_pair_counts[result['dataset_id']] = int(result['pair_count'])
1451+
1452+
return get_total_pair_counts.total_pair_counts[dataset_id]
1453+
get_total_pair_counts.total_pair_counts = None
1454+
1455+
1456+
@cache.memoize(timeout=86400)
1457+
def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None, concept_class_id=None,
1458+
ln_ratio_sign=0, confidence=DEFAULT_CONFIDENCE):
1459+
""" Query for TRAPI. Performs the calculations for all association methods
1460+
1461+
Parameters
1462+
----------
1463+
concept_id_1: String - OMOP concept ID
1464+
concept_id_2: (optional) String - OMOP concept ID
1465+
dataset_id: (optional) String - COHD dataset ID
1466+
domain_id: (optional) String - OMOP domain ID
1467+
concept_class_id: (optional) String - OMOP concept class ID
1468+
ln_ratio_sign: (optional) Int - 1: positive ln_ratio only; -1: negative ln_ratio only; 0: any ln_ratio
1469+
confidence: (optional) Float - Confidence level
1470+
1471+
Returns
1472+
-------
1473+
Dict results
1474+
"""
1475+
assert concept_id_1 is not None and str(concept_id_1), \
1476+
'query_cohd_mysql.py::query_trapi() - Bad input. concept_id_1={concept_id_1}'.format(
1477+
concept_id_1=str(concept_id_1)
1478+
)
1479+
1480+
# Connect to MYSQL database
1481+
conn = sql_connection()
1482+
cur = conn.cursor()
1483+
1484+
# Get the total number of pairs for Bonferonni adjustment
1485+
pair_count = get_total_pair_counts(dataset_id)
1486+
1487+
# Filter ln ratio
1488+
if ln_ratio_sign == 0:
1489+
ln_ratio_filter = ''
1490+
elif ln_ratio_sign > 0:
1491+
ln_ratio_filter = 'AND log(cp.concept_count * pc.count / (c1.concept_count * c2.concept_count + 0E0)) > 0'
1492+
elif ln_ratio_sign < 0:
1493+
ln_ratio_filter = 'AND log(cp.concept_count * pc.count / (c1.concept_count * c2.concept_count + 0E0)) < 0'
1494+
1495+
if concept_id_2 is not None:
1496+
# concept_id_2 is specified, only return the results for the pair (concept_id_1, concept_id_2)
1497+
concept_id_2 = int(concept_id_2)
1498+
order = concept_id_1 < concept_id_2
1499+
sql = '''SELECT
1500+
cp.dataset_id,
1501+
cp.concept_id_1 AS {rename_id_1},
1502+
cp.concept_id_2 AS {rename_id_2},
1503+
c1.concept_count AS {rename_count_1},
1504+
c2.concept_count AS {rename_count_2},
1505+
cp.concept_count AS concept_pair_count,
1506+
c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1507+
p_value,
1508+
ln_ratio,
1509+
ln_ratio_ci_lo,
1510+
ln_ratio_ci_hi,
1511+
cp.concept_count / (c1.concept_count + 0E0) AS {rename_rf1},
1512+
cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS {rename_rf1_ci_lo},
1513+
cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS {rename_rf1_ci_hi},
1514+
cp.concept_count / (c2.concept_count + 0E0) AS {rename_rf2},
1515+
cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS {rename_rf2_ci_lo},
1516+
cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS {rename_rf2_ci_hi},
1517+
log_odds,
1518+
log_odds_ci_lo,
1519+
log_odds_ci_hi,
1520+
pc.count AS patient_count
1521+
FROM cohd.concept_pair_counts cp
1522+
JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1523+
JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1524+
JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1525+
WHERE cp.dataset_id = %(dataset_id)s
1526+
AND c1.dataset_id = %(dataset_id)s
1527+
AND c2.dataset_id = %(dataset_id)s
1528+
AND cp.concept_id_1 = %(concept_id_1)s
1529+
AND cp.concept_id_2 = %(concept_id_2)s
1530+
{ln_ratio_filter}
1531+
;'''
1532+
params = {
1533+
'dataset_id': dataset_id,
1534+
'concept_id_1': concept_id_1 if order else concept_id_2,
1535+
'concept_id_2': concept_id_2 if order else concept_id_1
1536+
}
1537+
rename_id_1 = 'concept_id_1'
1538+
rename_id_2 = 'concept_id_2'
1539+
rename_count_1 = 'concept_1_count'
1540+
rename_count_2 = 'concept_2_count'
1541+
rename_rf1 = 'relative_frequency_1'
1542+
rename_rf2 = 'relative_frequency_2'
1543+
rename_rf1_ci_lo = 'rf1_ci_lo'
1544+
rename_rf1_ci_hi = 'rf1_ci_hi'
1545+
rename_rf2_ci_lo = 'rf2_ci_lo'
1546+
rename_rf2_ci_hi = 'rf2_ci_hi'
1547+
1548+
if not order:
1549+
rename_id_1 = 'concept_id_2'
1550+
rename_id_2 = 'concept_id_1'
1551+
rename_count_1 = 'concept_2_count'
1552+
rename_count_2 = 'concept_1_count'
1553+
rename_rf1 = 'relative_frequency_2'
1554+
rename_rf2 = 'relative_frequency_1'
1555+
rename_rf1_ci_lo = 'rf2_ci_lo'
1556+
rename_rf1_ci_hi = 'rf2_ci_hi'
1557+
rename_rf2_ci_lo = 'rf1_ci_lo'
1558+
rename_rf2_ci_hi = 'rf1_ci_hi'
1559+
sql = sql.format(rename_id_1=rename_id_1, rename_id_2=rename_id_2, rename_count_1=rename_count_1,
1560+
rename_count_2=rename_count_2, rename_rf1=rename_rf1, rename_rf2=rename_rf2,
1561+
ln_ratio_filter=ln_ratio_filter,
1562+
rename_rf1_ci_lo=rename_rf1_ci_lo, rename_rf1_ci_hi=rename_rf1_ci_hi,
1563+
rename_rf2_ci_lo=rename_rf2_ci_lo, rename_rf2_ci_hi=rename_rf2_ci_hi)
1564+
1565+
else:
1566+
# If concept_id_2 is not specified, get results for all pairs that include concept_id_1
1567+
sql = '''SELECT *
1568+
FROM
1569+
((SELECT
1570+
cp.dataset_id,
1571+
cp.concept_id_1,
1572+
cp.concept_id_2,
1573+
c1.concept_count AS concept_1_count,
1574+
c2.concept_count AS concept_2_count,
1575+
cp.concept_count AS concept_pair_count,
1576+
c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1577+
p_value,
1578+
ln_ratio,
1579+
ln_ratio_ci_lo,
1580+
ln_ratio_ci_hi,
1581+
cp.concept_count / (c1.concept_count + 0E0) AS relative_frequency_1,
1582+
cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS rf1_ci_lo,
1583+
cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS rf1_ci_hi,
1584+
cp.concept_count / (c2.concept_count + 0E0) AS relative_frequency_2,
1585+
cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS rf2_ci_lo,
1586+
cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS rf2_ci_hi,
1587+
log_odds,
1588+
log_odds_ci_lo,
1589+
log_odds_ci_hi,
1590+
c.concept_name AS concept_2_name,
1591+
c.domain_id AS concept_2_domain,
1592+
c.concept_class_id AS concept_2_class_id,
1593+
pc.count AS patient_count
1594+
FROM cohd.concept_pair_counts cp
1595+
JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1596+
JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1597+
JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1598+
JOIN cohd.concept c ON cp.concept_id_2 = c.concept_id
1599+
WHERE cp.dataset_id = %(dataset_id)s
1600+
AND c1.dataset_id = %(dataset_id)s
1601+
AND c2.dataset_id = %(dataset_id)s
1602+
AND cp.concept_id_1 = %(concept_id_1)s
1603+
{domain_filter}
1604+
{concept_class_filter}
1605+
{ln_ratio_filter})
1606+
UNION
1607+
(SELECT
1608+
cp.dataset_id,
1609+
cp.concept_id_2 AS concept_id_1,
1610+
cp.concept_id_1 AS concept_id_2,
1611+
c2.concept_count AS concept_1_count,
1612+
c1.concept_count AS concept_2_count,
1613+
cp.concept_count AS concept_pair_count,
1614+
c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1615+
p_value,
1616+
ln_ratio,
1617+
ln_ratio_ci_lo,
1618+
ln_ratio_ci_hi,
1619+
cp.concept_count / (c2.concept_count + 0E0) AS relative_frequency_1,
1620+
cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS rf1_ci_lo,
1621+
cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS rf1_ci_hi,
1622+
cp.concept_count / (c1.concept_count + 0E0) AS relative_frequency_2,
1623+
cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS rf2_ci_lo,
1624+
cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS rf2_ci_hi,
1625+
log_odds,
1626+
log_odds_ci_lo,
1627+
log_odds_ci_hi,
1628+
c.concept_name AS concept_2_name,
1629+
c.domain_id AS concept_2_domain,
1630+
c.concept_class_id AS concept_2_class_id,
1631+
pc.count AS patient_count
1632+
FROM cohd.concept_pair_counts cp
1633+
JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1634+
JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1635+
JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1636+
JOIN cohd.concept c ON cp.concept_id_1 = c.concept_id
1637+
WHERE cp.dataset_id = %(dataset_id)s
1638+
AND c1.dataset_id = %(dataset_id)s
1639+
AND c2.dataset_id = %(dataset_id)s
1640+
AND cp.concept_id_2 = %(concept_id_1)s
1641+
{domain_filter}
1642+
{concept_class_filter}
1643+
{ln_ratio_filter})) x
1644+
ORDER BY ABS(ln_ratio) DESC;'''
1645+
params = {
1646+
'dataset_id': dataset_id,
1647+
'concept_id_1': concept_id_1,
1648+
}
1649+
1650+
if domain_id is not None and not domain_id == ['']:
1651+
# Restrict the associated concept by domain
1652+
domain_filter = 'AND c.domain_id = %(domain_id)s'
1653+
params['domain_id'] = domain_id
1654+
else:
1655+
# Unrestricted domain
1656+
domain_filter = ''
1657+
1658+
# Filter concepts by concept_class
1659+
if concept_class_id is None or not concept_class_id or concept_class_id == [''] or \
1660+
concept_class_id.isspace():
1661+
concept_class_filter = ''
1662+
else:
1663+
concept_class_filter = 'AND concept_class_id = %(concept_class_id)s'
1664+
params['concept_class_id'] = concept_class_id
1665+
1666+
sql = sql.format(domain_filter=domain_filter, concept_class_filter=concept_class_filter,
1667+
ln_ratio_filter=ln_ratio_filter)
1668+
1669+
cur.execute(sql, params)
1670+
json_return = cur.fetchall()
1671+
1672+
# Perform calculations for results
1673+
for row in json_return:
1674+
cpc = row['concept_pair_count']
1675+
c1 = row['concept_1_count']
1676+
c2 = row['concept_2_count']
1677+
1678+
# Confidence interval for obsExpRatio
1679+
# The CI bounds may hit Inf, which causes issues with JSON serialization. Limit it to 999
1680+
row['ln_ratio_ci'] = (clip(row['ln_ratio_ci_lo'], JSON_INFINITY_REPLACEMENT),
1681+
clip(row['ln_ratio_ci_hi'], JSON_INFINITY_REPLACEMENT))
1682+
1683+
# Confidence intervals for relative frequencies
1684+
row['relative_frequency_1_ci'] = row['rf1_ci_lo'], min(row['rf1_ci_hi'], JSON_INFINITY_REPLACEMENT)
1685+
row['relative_frequency_2_ci'] = row['rf2_ci_lo'], min(row['rf2_ci_hi'], JSON_INFINITY_REPLACEMENT)
1686+
1687+
# Chi-square
1688+
p_value = row['p_value']
1689+
row['chi_square_p-value'] = max(p_value, MIN_P)
1690+
row['chi_square_p-value_adjusted'] = max(min(p_value * pair_count, 1.0), MIN_P) # Bonferonni adjustment
1691+
1692+
# Log-odds
1693+
row['log_odds'] = clip(row['log_odds'], JSON_INFINITY_REPLACEMENT)
1694+
row['log_odds_ci'] = [clip(row['log_odds_ci_lo'], JSON_INFINITY_REPLACEMENT),
1695+
clip(row['log_odds_ci_hi'], JSON_INFINITY_REPLACEMENT)]
1696+
1697+
cur.close()
1698+
conn.close()
1699+
1700+
json_return = {"results": json_return}
1701+
return json_return
1702+
1703+
14281704
def health():
14291705
""" Quick health check of MySQL database
14301706
Simply queries the dataset table
@@ -1437,8 +1713,9 @@ def health():
14371713
cur = conn.cursor()
14381714

14391715
try:
1440-
sql = '''SELECT *
1441-
FROM cohd.dataset;'''
1716+
sql = '''SELECT dataset_id
1717+
FROM cohd.dataset
1718+
LIMIT 1;'''
14421719
cur.execute(sql)
14431720
datasets = cur.fetchall()
14441721
healthy = len(datasets) > 0

cohd/scheduled_tasks.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,20 @@ def task_build_cache():
1010
print('Running scheduled task to build cache')
1111
BiolinkConceptMapper.build_mappings()
1212

13+
# Schedule a task to update the Biolink Mapping cache nightly (all environments)
14+
scheduler = BackgroundScheduler()
15+
scheduler.add_job(func=BiolinkConceptMapper.prefetch_mappings, trigger='cron', hour=6)
1316

1417
# Schedule a task to build the cache every first Saturday of the month (in ITRB-CI and Dev only)
1518
deployment_env = app.config.get('DEPLOYMENT_ENV', 'dev').lower()
16-
if deployment_env in ('itrb-ci', 'dev'):
17-
scheduler = BackgroundScheduler()
18-
scheduler.add_job(func=task_build_cache, trigger='cron', day='1st sat', hour=0)
19-
scheduler.start()
19+
if False:
20+
scheduler.add_job(func=task_build_cache, trigger='cron', day='1st sat', hour=0)
2021
logging.info(f'Background task scheduled to build Biolink mappings (env: {deployment_env})')
2122
else:
2223
logging.info(f'Background task NOT scheduled to build Biolink mappings (env: {deployment_env})')
2324

25+
scheduler.start()
26+
2427

2528
# Registering a shutdown seems to cause UWSGI to have issues shutting down COHD, even with the wait=False option.
2629
# Since the default job store does not persist jobs anyway, remove this line for now

cohd/trapi/reasoner_validator_ext.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from yaml import CLoader as Loader
88
except ImportError:
99
from yaml import Loader
10-
from reasoner_validator import TRAPIResponseValidator
10+
from reasoner_validator.validator import TRAPIResponseValidator
1111
from reasoner_validator.trapi import TRAPISchemaValidator, openapi_to_jsonschema
1212

1313

0 commit comments

Comments
 (0)