66
77from .omop_xref import xref_to_omop_standard_concept , omop_map_to_standard , omop_map_from_standard , \
88 xref_from_omop_standard_concept , xref_from_omop_local , xref_to_omop_local
9- from .cohd_utilities import ln_ratio_ci , rel_freq_ci , log_odds
9+ from .cohd_utilities import ln_ratio_ci , rel_freq_ci , log_odds , clip
1010from .app import cache
1111
1212# Configuration
@@ -1206,7 +1206,7 @@ def query_association(method, concept_id_1, concept_id_2=None, dataset_id=None,
12061206
12071207
12081208@cache .memoize (timeout = 86400 )
1209- def query_trapi (concept_id_1 , concept_id_2 = None , dataset_id = None , domain_id = None , concept_class_id = None ,
1209+ def query_trapi_old (concept_id_1 , concept_id_2 = None , dataset_id = None , domain_id = None , concept_class_id = None ,
12101210 ln_ratio_sign = 0 , confidence = DEFAULT_CONFIDENCE ):
12111211 """ Query for TRAPI. Performs the calculations for all association methods
12121212
@@ -1425,6 +1425,282 @@ def query_trapi(concept_id_1, concept_id_2=None, dataset_id=None, domain_id=None
14251425 return json_return
14261426
14271427
1428+ def get_total_pair_counts (dataset_id : int ) -> int :
1429+ """ Returns total number of pairs of concepts in the given dataset
1430+
1431+ Params
1432+ ------
1433+ dataset_id: (int) dataset id
1434+
1435+ Returns
1436+ -------
1437+ (int) total number of pairs of concepts
1438+ """
1439+ if not get_total_pair_counts .total_pair_counts :
1440+ # Get the total pair counts and store locally for faster future retrieval
1441+ get_total_pair_counts .total_pair_counts = dict ()
1442+ conn = sql_connection ()
1443+ cur = conn .cursor ()
1444+ sql = '''SELECT dataset_id, SUM(count) AS pair_count
1445+ FROM domain_pair_concept_counts
1446+ GROUP BY dataset_id;'''
1447+ cur .execute (sql )
1448+ results = cur .fetchall ()
1449+ for result in results :
1450+ get_total_pair_counts .total_pair_counts [result ['dataset_id' ]] = int (result ['pair_count' ])
1451+
1452+ return get_total_pair_counts .total_pair_counts [dataset_id ]
1453+ get_total_pair_counts .total_pair_counts = None
1454+
1455+
1456+ @cache .memoize (timeout = 86400 )
1457+ def query_trapi (concept_id_1 , concept_id_2 = None , dataset_id = None , domain_id = None , concept_class_id = None ,
1458+ ln_ratio_sign = 0 , confidence = DEFAULT_CONFIDENCE ):
1459+ """ Query for TRAPI. Performs the calculations for all association methods
1460+
1461+ Parameters
1462+ ----------
1463+ concept_id_1: String - OMOP concept ID
1464+ concept_id_2: (optional) String - OMOP concept ID
1465+ dataset_id: (optional) String - COHD dataset ID
1466+ domain_id: (optional) String - OMOP domain ID
1467+ concept_class_id: (optional) String - OMOP concept class ID
1468+ ln_ratio_sign: (optional) Int - 1: positive ln_ratio only; -1: negative ln_ratio only; 0: any ln_ratio
1469+ confidence: (optional) Float - Confidence level
1470+
1471+ Returns
1472+ -------
1473+ Dict results
1474+ """
1475+ assert concept_id_1 is not None and str (concept_id_1 ), \
1476+ 'query_cohd_mysql.py::query_trapi() - Bad input. concept_id_1={concept_id_1}' .format (
1477+ concept_id_1 = str (concept_id_1 )
1478+ )
1479+
1480+ # Connect to MYSQL database
1481+ conn = sql_connection ()
1482+ cur = conn .cursor ()
1483+
1484+ # Get the total number of pairs for Bonferonni adjustment
1485+ pair_count = get_total_pair_counts (dataset_id )
1486+
1487+ # Filter ln ratio
1488+ if ln_ratio_sign == 0 :
1489+ ln_ratio_filter = ''
1490+ elif ln_ratio_sign > 0 :
1491+ ln_ratio_filter = 'AND log(cp.concept_count * pc.count / (c1.concept_count * c2.concept_count + 0E0)) > 0'
1492+ elif ln_ratio_sign < 0 :
1493+ ln_ratio_filter = 'AND log(cp.concept_count * pc.count / (c1.concept_count * c2.concept_count + 0E0)) < 0'
1494+
1495+ if concept_id_2 is not None :
1496+ # concept_id_2 is specified, only return the results for the pair (concept_id_1, concept_id_2)
1497+ concept_id_2 = int (concept_id_2 )
1498+ order = concept_id_1 < concept_id_2
1499+ sql = '''SELECT
1500+ cp.dataset_id,
1501+ cp.concept_id_1 AS {rename_id_1},
1502+ cp.concept_id_2 AS {rename_id_2},
1503+ c1.concept_count AS {rename_count_1},
1504+ c2.concept_count AS {rename_count_2},
1505+ cp.concept_count AS concept_pair_count,
1506+ c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1507+ p_value,
1508+ ln_ratio,
1509+ ln_ratio_ci_lo,
1510+ ln_ratio_ci_hi,
1511+ cp.concept_count / (c1.concept_count + 0E0) AS {rename_rf1},
1512+ cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS {rename_rf1_ci_lo},
1513+ cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS {rename_rf1_ci_hi},
1514+ cp.concept_count / (c2.concept_count + 0E0) AS {rename_rf2},
1515+ cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS {rename_rf2_ci_lo},
1516+ cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS {rename_rf2_ci_hi},
1517+ log_odds,
1518+ log_odds_ci_lo,
1519+ log_odds_ci_hi,
1520+ pc.count AS patient_count
1521+ FROM cohd.concept_pair_counts cp
1522+ JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1523+ JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1524+ JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1525+ WHERE cp.dataset_id = %(dataset_id)s
1526+ AND c1.dataset_id = %(dataset_id)s
1527+ AND c2.dataset_id = %(dataset_id)s
1528+ AND cp.concept_id_1 = %(concept_id_1)s
1529+ AND cp.concept_id_2 = %(concept_id_2)s
1530+ {ln_ratio_filter}
1531+ ;'''
1532+ params = {
1533+ 'dataset_id' : dataset_id ,
1534+ 'concept_id_1' : concept_id_1 if order else concept_id_2 ,
1535+ 'concept_id_2' : concept_id_2 if order else concept_id_1
1536+ }
1537+ rename_id_1 = 'concept_id_1'
1538+ rename_id_2 = 'concept_id_2'
1539+ rename_count_1 = 'concept_1_count'
1540+ rename_count_2 = 'concept_2_count'
1541+ rename_rf1 = 'relative_frequency_1'
1542+ rename_rf2 = 'relative_frequency_2'
1543+ rename_rf1_ci_lo = 'rf1_ci_lo'
1544+ rename_rf1_ci_hi = 'rf1_ci_hi'
1545+ rename_rf2_ci_lo = 'rf2_ci_lo'
1546+ rename_rf2_ci_hi = 'rf2_ci_hi'
1547+
1548+ if not order :
1549+ rename_id_1 = 'concept_id_2'
1550+ rename_id_2 = 'concept_id_1'
1551+ rename_count_1 = 'concept_2_count'
1552+ rename_count_2 = 'concept_1_count'
1553+ rename_rf1 = 'relative_frequency_2'
1554+ rename_rf2 = 'relative_frequency_1'
1555+ rename_rf1_ci_lo = 'rf2_ci_lo'
1556+ rename_rf1_ci_hi = 'rf2_ci_hi'
1557+ rename_rf2_ci_lo = 'rf1_ci_lo'
1558+ rename_rf2_ci_hi = 'rf1_ci_hi'
1559+ sql = sql .format (rename_id_1 = rename_id_1 , rename_id_2 = rename_id_2 , rename_count_1 = rename_count_1 ,
1560+ rename_count_2 = rename_count_2 , rename_rf1 = rename_rf1 , rename_rf2 = rename_rf2 ,
1561+ ln_ratio_filter = ln_ratio_filter ,
1562+ rename_rf1_ci_lo = rename_rf1_ci_lo , rename_rf1_ci_hi = rename_rf1_ci_hi ,
1563+ rename_rf2_ci_lo = rename_rf2_ci_lo , rename_rf2_ci_hi = rename_rf2_ci_hi )
1564+
1565+ else :
1566+ # If concept_id_2 is not specified, get results for all pairs that include concept_id_1
1567+ sql = '''SELECT *
1568+ FROM
1569+ ((SELECT
1570+ cp.dataset_id,
1571+ cp.concept_id_1,
1572+ cp.concept_id_2,
1573+ c1.concept_count AS concept_1_count,
1574+ c2.concept_count AS concept_2_count,
1575+ cp.concept_count AS concept_pair_count,
1576+ c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1577+ p_value,
1578+ ln_ratio,
1579+ ln_ratio_ci_lo,
1580+ ln_ratio_ci_hi,
1581+ cp.concept_count / (c1.concept_count + 0E0) AS relative_frequency_1,
1582+ cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS rf1_ci_lo,
1583+ cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS rf1_ci_hi,
1584+ cp.concept_count / (c2.concept_count + 0E0) AS relative_frequency_2,
1585+ cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS rf2_ci_lo,
1586+ cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS rf2_ci_hi,
1587+ log_odds,
1588+ log_odds_ci_lo,
1589+ log_odds_ci_hi,
1590+ c.concept_name AS concept_2_name,
1591+ c.domain_id AS concept_2_domain,
1592+ c.concept_class_id AS concept_2_class_id,
1593+ pc.count AS patient_count
1594+ FROM cohd.concept_pair_counts cp
1595+ JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1596+ JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1597+ JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1598+ JOIN cohd.concept c ON cp.concept_id_2 = c.concept_id
1599+ WHERE cp.dataset_id = %(dataset_id)s
1600+ AND c1.dataset_id = %(dataset_id)s
1601+ AND c2.dataset_id = %(dataset_id)s
1602+ AND cp.concept_id_1 = %(concept_id_1)s
1603+ {domain_filter}
1604+ {concept_class_filter}
1605+ {ln_ratio_filter})
1606+ UNION
1607+ (SELECT
1608+ cp.dataset_id,
1609+ cp.concept_id_2 AS concept_id_1,
1610+ cp.concept_id_1 AS concept_id_2,
1611+ c2.concept_count AS concept_1_count,
1612+ c1.concept_count AS concept_2_count,
1613+ cp.concept_count AS concept_pair_count,
1614+ c1.concept_count * c2.concept_count / (pc.count + 0E0) AS expected_count,
1615+ p_value,
1616+ ln_ratio,
1617+ ln_ratio_ci_lo,
1618+ ln_ratio_ci_hi,
1619+ cp.concept_count / (c2.concept_count + 0E0) AS relative_frequency_1,
1620+ cp.pair_count_ci_lo / (c2.ci_hi + 0E0) AS rf1_ci_lo,
1621+ cp.pair_count_ci_hi / (c2.ci_lo + 0E0) AS rf1_ci_hi,
1622+ cp.concept_count / (c1.concept_count + 0E0) AS relative_frequency_2,
1623+ cp.pair_count_ci_lo / (c1.ci_hi + 0E0) AS rf2_ci_lo,
1624+ cp.pair_count_ci_hi / (c1.ci_lo + 0E0) AS rf2_ci_hi,
1625+ log_odds,
1626+ log_odds_ci_lo,
1627+ log_odds_ci_hi,
1628+ c.concept_name AS concept_2_name,
1629+ c.domain_id AS concept_2_domain,
1630+ c.concept_class_id AS concept_2_class_id,
1631+ pc.count AS patient_count
1632+ FROM cohd.concept_pair_counts cp
1633+ JOIN cohd.concept_counts c1 ON cp.concept_id_1 = c1.concept_id
1634+ JOIN cohd.concept_counts c2 ON cp.concept_id_2 = c2.concept_id
1635+ JOIN cohd.patient_count pc ON cp.dataset_id = pc.dataset_id
1636+ JOIN cohd.concept c ON cp.concept_id_1 = c.concept_id
1637+ WHERE cp.dataset_id = %(dataset_id)s
1638+ AND c1.dataset_id = %(dataset_id)s
1639+ AND c2.dataset_id = %(dataset_id)s
1640+ AND cp.concept_id_2 = %(concept_id_1)s
1641+ {domain_filter}
1642+ {concept_class_filter}
1643+ {ln_ratio_filter})) x
1644+ ORDER BY ABS(ln_ratio) DESC;'''
1645+ params = {
1646+ 'dataset_id' : dataset_id ,
1647+ 'concept_id_1' : concept_id_1 ,
1648+ }
1649+
1650+ if domain_id is not None and not domain_id == ['' ]:
1651+ # Restrict the associated concept by domain
1652+ domain_filter = 'AND c.domain_id = %(domain_id)s'
1653+ params ['domain_id' ] = domain_id
1654+ else :
1655+ # Unrestricted domain
1656+ domain_filter = ''
1657+
1658+ # Filter concepts by concept_class
1659+ if concept_class_id is None or not concept_class_id or concept_class_id == ['' ] or \
1660+ concept_class_id .isspace ():
1661+ concept_class_filter = ''
1662+ else :
1663+ concept_class_filter = 'AND concept_class_id = %(concept_class_id)s'
1664+ params ['concept_class_id' ] = concept_class_id
1665+
1666+ sql = sql .format (domain_filter = domain_filter , concept_class_filter = concept_class_filter ,
1667+ ln_ratio_filter = ln_ratio_filter )
1668+
1669+ cur .execute (sql , params )
1670+ json_return = cur .fetchall ()
1671+
1672+ # Perform calculations for results
1673+ for row in json_return :
1674+ cpc = row ['concept_pair_count' ]
1675+ c1 = row ['concept_1_count' ]
1676+ c2 = row ['concept_2_count' ]
1677+
1678+ # Confidence interval for obsExpRatio
1679+ # The CI bounds may hit Inf, which causes issues with JSON serialization. Limit it to 999
1680+ row ['ln_ratio_ci' ] = (clip (row ['ln_ratio_ci_lo' ], JSON_INFINITY_REPLACEMENT ),
1681+ clip (row ['ln_ratio_ci_hi' ], JSON_INFINITY_REPLACEMENT ))
1682+
1683+ # Confidence intervals for relative frequencies
1684+ row ['relative_frequency_1_ci' ] = row ['rf1_ci_lo' ], min (row ['rf1_ci_hi' ], JSON_INFINITY_REPLACEMENT )
1685+ row ['relative_frequency_2_ci' ] = row ['rf2_ci_lo' ], min (row ['rf2_ci_hi' ], JSON_INFINITY_REPLACEMENT )
1686+
1687+ # Chi-square
1688+ p_value = row ['p_value' ]
1689+ row ['chi_square_p-value' ] = max (p_value , MIN_P )
1690+ row ['chi_square_p-value_adjusted' ] = max (min (p_value * pair_count , 1.0 ), MIN_P ) # Bonferonni adjustment
1691+
1692+ # Log-odds
1693+ row ['log_odds' ] = clip (row ['log_odds' ], JSON_INFINITY_REPLACEMENT )
1694+ row ['log_odds_ci' ] = [clip (row ['log_odds_ci_lo' ], JSON_INFINITY_REPLACEMENT ),
1695+ clip (row ['log_odds_ci_hi' ], JSON_INFINITY_REPLACEMENT )]
1696+
1697+ cur .close ()
1698+ conn .close ()
1699+
1700+ json_return = {"results" : json_return }
1701+ return json_return
1702+
1703+
14281704def health ():
14291705 """ Quick health check of MySQL database
14301706 Simply queries the dataset table
@@ -1437,8 +1713,9 @@ def health():
14371713 cur = conn .cursor ()
14381714
14391715 try :
1440- sql = '''SELECT *
1441- FROM cohd.dataset;'''
1716+ sql = '''SELECT dataset_id
1717+ FROM cohd.dataset
1718+ LIMIT 1;'''
14421719 cur .execute (sql )
14431720 datasets = cur .fetchall ()
14441721 healthy = len (datasets ) > 0
0 commit comments