Skip to content
2 changes: 1 addition & 1 deletion docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ Individual classes will accept arguments upon initialization to set parameters r
* - ``SEQREPO_ROOT_DIR``
- Path to SeqRepo directory (i.e. contains ``aliases.sqlite3`` database file, and ``sequences`` directory). Used by :py:class:`SeqRepoAccess <cool_seq_tool.handlers.seqrepo_access.SeqRepoAccess>`. If not defined, defaults to ``/usr/local/share/seqrepo/latest``.
* - ``UTA_DB_URL``
- A `libpq connection string <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING>`_, i.e. of the form ``postgresql://<user>:<password>@<host>:<port>/<database>/<schema>``, used by the :py:class:`UtaDatabase <cool_seq_tool.sources.uta_database.UtaDatabase>` class. By default, it is set to ``postgresql://anonymous@localhost:5432/uta/uta_20241220``.
- A `libpq connection URI <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING-URIS>`_, i.e. of the form ``postgresql://<user>:<password>@<host>:<port>/<database>?options=search_path%3D<schema>,public``, used by the :py:class:`UtaDatabase <cool_seq_tool.sources.uta_database.UtaDatabase>` class. By default, it is set to ``postgresql://anonymous@localhost:5432/uta?options=-csearch_path%3Duta_20241220,public``.
* - ``LIFTOVER_CHAIN_37_TO_38``
- A path to a `chainfile <https://genome.ucsc.edu/goldenPath/help/chain.html>`_ for lifting from GRCh37 to GRCh38. Used by the :py:class:`LiftOver <cool_seq_tool.mappers.liftover.LiftOver>` class as input to `agct <https://pypi.org/project/agct/>`_. If not provided, agct will fetch it automatically from UCSC.
* - ``LIFTOVER_CHAIN_38_TO_37``
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ description = "Common Operation on Lots of Sequences Tool"
license = "MIT"
license-files = ["LICENSE"]
dependencies = [
"asyncpg",
"psycopg[pool]",
"boto3",
"agct >= 0.2.0rc1",
"polars ~= 1.0",
Expand All @@ -37,6 +37,9 @@ dependencies = [
dynamic = ["version"]

[project.optional-dependencies]
pg_binary = [
"psycopg[binary, pool]",
]
dev = [
"prek>=0.2.23",
"ipython",
Expand Down
19 changes: 11 additions & 8 deletions src/cool_seq_tool/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@
data handler and mapping resources for straightforward access.
"""

import logging
from pathlib import Path

from biocommons.seqrepo import SeqRepo
from psycopg_pool import AsyncConnectionPool

from cool_seq_tool.handlers.seqrepo_access import SEQREPO_ROOT_DIR, SeqRepoAccess
from cool_seq_tool.mappers import (
Expand All @@ -16,9 +16,7 @@
)
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
from cool_seq_tool.sources.transcript_mappings import TranscriptMappings
from cool_seq_tool.sources.uta_database import UTA_DB_URL, UtaDatabase

_logger = logging.getLogger(__name__)
from cool_seq_tool.sources.uta_database import LazyUtaDatabase, UtaDatabase


class CoolSeqTool:
Expand All @@ -40,7 +38,7 @@ def __init__(
transcript_file_path: Path | None = None,
lrg_refseqgene_path: Path | None = None,
mane_data_path: Path | None = None,
db_url: str = UTA_DB_URL,
uta_connection_pool: AsyncConnectionPool | None = None,
sr: SeqRepo | None = None,
force_local_files: bool = False,
) -> None:
Expand Down Expand Up @@ -75,8 +73,10 @@ def __init__(
:param transcript_file_path: The path to ``transcript_mapping.tsv``
:param lrg_refseqgene_path: The path to the LRG_RefSeqGene file
:param mane_data_path: Path to RefSeq MANE summary data
:param db_url: PostgreSQL connection URL
Format: ``driver://user:password@host/database/schema``
:param uta_connection_pool: pyscopg connection pool to UTA instance. If not
provided, a lazy UTA connection will be used, meaning the connection won't
be initiated until the first attempted UTA query, and will use environment
configs/library defaults
:param sr: SeqRepo instance. If this is not provided, will create a new instance
:param force_local_files: if ``True``, don't check for or try to acquire latest
versions of static data files -- just use most recently available, if any
Expand All @@ -92,7 +92,10 @@ def __init__(
self.mane_transcript_mappings = ManeTranscriptMappings(
mane_data_path=mane_data_path, from_local=force_local_files
)
self.uta_db = UtaDatabase(db_url=db_url)
if uta_connection_pool:
self.uta_db = UtaDatabase(uta_connection_pool)
else:
self.uta_db = LazyUtaDatabase()
self.alignment_mapper = AlignmentMapper(
self.seqrepo_access, self.transcript_mappings, self.uta_db
)
Expand Down
19 changes: 11 additions & 8 deletions src/cool_seq_tool/mappers/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ async def p_to_c(
* Warning, if unable to translate to cDNA representation. Else ``None``
"""
# Get cDNA accession
temp_c_ac = await self.uta_db.p_to_c_ac(p_ac)
async with self.uta_db.repository() as uta:
temp_c_ac = await uta.p_to_c_ac(p_ac)
if temp_c_ac:
c_ac = temp_c_ac[-1]
else:
Expand Down Expand Up @@ -89,7 +90,8 @@ async def _get_cds_start(self, c_ac: str) -> tuple[int | None, str | None]:
- CDS start site if found. Else ``None``
- Warning, if unable to get CDS start. Else ``None``
"""
cds_start_end = await self.uta_db.get_cds_start_end(c_ac)
async with self.uta_db.repository() as uta:
cds_start_end = await uta.get_cds_start_end(c_ac)
if not cds_start_end:
cds_start = None
warning = f"Accession {c_ac} not found in UTA db"
Expand Down Expand Up @@ -149,12 +151,13 @@ async def c_to_g(
c_start_pos -= 1

# Get aligned genomic and transcript data
genomic_tx_data = await self.uta_db.get_genomic_tx_data(
c_ac,
(c_start_pos + cds_start, c_end_pos + cds_start),
AnnotationLayer.CDNA,
target_genome_assembly=target_genome_assembly,
)
async with self.uta_db.repository() as uta:
genomic_tx_data = await uta.get_genomic_tx_data(
c_ac,
(c_start_pos + cds_start, c_end_pos + cds_start),
AnnotationLayer.CDNA,
target_genome_assembly=target_genome_assembly,
)

if not genomic_tx_data:
warning = (
Expand Down
141 changes: 87 additions & 54 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@
TranscriptPriority,
)
from cool_seq_tool.sources.mane_transcript_mappings import ManeTranscriptMappings
from cool_seq_tool.sources.uta_database import GenomicAlnData, UtaDatabase
from cool_seq_tool.sources.uta_database import (
GenomicAlnData,
NoMatchingAlignmentError,
UtaDatabase,
)
from cool_seq_tool.utils import service_meta

_logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -637,29 +641,43 @@ async def _get_all_exon_coords(
The list will be ordered by ascending exon number.
"""
if genomic_ac:
query = f"""
query = """
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
FROM {self.uta_db.schema}.tx_exon_aln_mv
WHERE tx_ac = '{tx_ac}'
FROM tx_exon_aln_mv
WHERE tx_ac = %(tx_ac)s
AND alt_aln_method = 'splign'
AND alt_ac = '{genomic_ac}'
ORDER BY ord ASC
""" # noqa: S608
AND alt_ac = %(genomic_ac)s
ORDER BY ord ASC;
"""
else:
query = f"""
query = """
SELECT DISTINCT ord, tx_start_i, tx_end_i, alt_start_i, alt_end_i, alt_strand
FROM {self.uta_db.schema}.tx_exon_aln_mv as t
INNER JOIN {self.uta_db.schema}._seq_anno_most_recent as s
FROM tx_exon_aln_mv as t
INNER JOIN _seq_anno_most_recent as s
ON t.alt_ac = s.ac
WHERE s.descr = ''
AND t.tx_ac = '{tx_ac}'
AND t.tx_ac = %(tx_ac)s
AND t.alt_aln_method = 'splign'
AND t.alt_ac like 'NC_000%'
ORDER BY ord ASC
""" # noqa: S608
AND t.alt_ac like 'NC_000%%'
ORDER BY ord ASC;
"""

results = await self.uta_db.execute_query(query)
return [_ExonCoord(**r) for r in results]
async with self.uta_db.repository() as uta:
cursor = await uta.execute_query(
query, {"tx_ac": tx_ac, "genomic_ac": genomic_ac}
)
results = await cursor.fetchall()
return [
_ExonCoord(
ord=r[0],
tx_start_i=r[1],
tx_end_i=r[2],
alt_start_i=r[3],
alt_end_i=r[4],
alt_strand=r[5],
)
for r in results
]

async def _get_genomic_aln_coords(
self,
Expand Down Expand Up @@ -690,13 +708,14 @@ async def _get_genomic_aln_coords(
aligned_coords = {"start": None, "end": None}
for exon, key in [(tx_exon_start, "start"), (tx_exon_end, "end")]:
if exon:
aligned_coord, warning = await self.uta_db.get_alt_ac_start_or_end(
tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
)
if aligned_coord:
async with self.uta_db.repository() as uta:
try:
aligned_coord = await uta.get_alt_ac_start_or_end(
tx_ac, exon.tx_start_i, exon.tx_end_i, gene=gene
)
except NoMatchingAlignmentError as e:
return None, None, str(e)
aligned_coords[key] = aligned_coord
else:
return None, None, warning

return *aligned_coords.values(), None

Expand Down Expand Up @@ -827,19 +846,22 @@ async def _genomic_to_tx_segment(

# Validate inputs exist in UTA
if gene:
gene_validation = await self.uta_db.gene_exists(gene)
async with self.uta_db.repository() as uta:
gene_validation = await uta.gene_exists(gene)
if not gene_validation:
return GenomicTxSeg(errors=[f"Gene does not exist in UTA: {gene}"])

if transcript:
transcript_validation = await self.uta_db.transcript_exists(transcript)
async with self.uta_db.repository() as uta:
transcript_validation = await uta.transcript_exists(transcript)
if not transcript_validation:
return GenomicTxSeg(
errors=[f"Transcript does not exist in UTA: {transcript}"]
)

if genomic_ac:
grch38_ac = await self.uta_db.get_newest_assembly_ac(genomic_ac)
async with self.uta_db.repository() as uta:
grch38_ac = await uta.get_newest_assembly_ac(genomic_ac)
if grch38_ac:
genomic_ac = grch38_ac[0]
else:
Expand Down Expand Up @@ -888,16 +910,20 @@ async def _genomic_to_tx_segment(
transcript = results.refseq
else:
# Run if gene is for a noncoding transcript
query = f"""
query = """
SELECT DISTINCT tx_ac
FROM {self.uta_db.schema}.tx_exon_aln_mv
WHERE hgnc = '{gene}'
AND alt_ac = '{genomic_ac}'
""" # noqa: S608
result = await self.uta_db.execute_query(query)
FROM tx_exon_aln_mv
WHERE hgnc = %(gene)s
AND alt_ac = %(genomic_ac)s;
"""
async with self.uta_db.repository() as uta:
cursor = await uta.execute_query(
query, {"gene": gene, "genomic_ac": genomic_ac}
)
result = await cursor.fetchone()

if result:
transcript = result[0]["tx_ac"]
transcript = result[0]
else:
return GenomicTxSeg(
errors=[
Expand Down Expand Up @@ -955,13 +981,14 @@ async def _genomic_to_tx_segment(
)
else:
is_exonic = True
exon_data = await self.uta_db.get_tx_exon_aln_data(
transcript,
genomic_pos,
genomic_pos,
alt_ac=genomic_ac,
use_tx_pos=False,
)
async with self.uta_db.repository() as uta:
exon_data = await uta.get_tx_exon_aln_data(
transcript,
genomic_pos,
genomic_pos,
alt_ac=genomic_ac,
use_tx_pos=False,
)
exon_num = exon_data[0].ord

offset = self._get_exon_offset(
Expand Down Expand Up @@ -1030,20 +1057,24 @@ async def _validate_genomic_breakpoint(
for the transcript, ``False`` if not. Breakpoints past this threshold
are likely erroneous.
"""
query = f"""
query = """
WITH tx_boundaries AS (
SELECT
MIN(alt_start_i) AS min_start,
MAX(alt_end_i) AS max_end
FROM {self.uta_db.schema}.tx_exon_aln_mv
WHERE tx_ac = '{tx_ac}'
AND alt_ac = '{genomic_ac}'
FROM tx_exon_aln_mv
WHERE tx_ac = %(tx_ac)s
AND alt_ac = %(genomic_ac)s
)
SELECT * FROM tx_boundaries
WHERE {pos} between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150)
""" # noqa: S608
results = await self.uta_db.execute_query(query)
return bool(results)
WHERE %(pos)s between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150);
"""
async with self.uta_db.repository() as uta:
cursor = await uta.execute_query(
query, {"tx_ac": tx_ac, "genomic_ac": genomic_ac, "pos": pos}
)
result = await cursor.fetchone()
return bool(result)

async def _get_tx_ac_gene(
self,
Expand All @@ -1058,18 +1089,20 @@ async def _get_tx_ac_gene(
:return: HGNC gene symbol associated to transcript and
warning
"""
query = f"""
query = """
SELECT DISTINCT hgnc
FROM {self.uta_db.schema}.tx_exon_aln_mv
WHERE tx_ac = '{tx_ac}'
FROM tx_exon_aln_mv
WHERE tx_ac = %(tx_ac)s
ORDER BY hgnc
LIMIT 1;
""" # noqa: S608
results = await self.uta_db.execute_query(query)
if not results:
"""
async with self.uta_db.repository() as uta:
cursor = await uta.execute_query(query, {"tx_ac": tx_ac})
result = await cursor.fetchone()
if not result:
return None, f"No gene(s) found given {tx_ac}"

return results[0]["hgnc"], None
return result[0], None

@staticmethod
def _is_exonic_breakpoint(pos: int, tx_genomic_coords: list[_ExonCoord]) -> bool:
Expand Down
Loading