Skip to content

Commit f15d597

Browse files
authored
Merge pull request #235 from TeresasaZ/dev
genes by positions + query genes databases, models, endpoints and tests
2 parents ddcf94e + 5bc4bf5 commit f15d597

8 files changed

Lines changed: 1364 additions & 1 deletion

File tree

api/models/eplant2.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,29 @@ class Publications(db.Model):
4949
journal: db.Mapped[str] = db.mapped_column(db.String(64), nullable=False, primary_key=True)
5050
title: db.Mapped[str] = db.mapped_column(TEXT(), nullable=False, primary_key=True)
5151
pubmed: db.Mapped[str] = db.mapped_column(db.String(16), nullable=False, primary_key=True)
52+
53+
54+
class TAIR10GFF3(db.Model):
    """ORM mapping for the ``tair10_gff3`` table on the ``eplant2`` bind.

    Column names mirror GFF3 field names (SeqID, Source, Type, Start, End,
    Score, Strand, Phase, Attributes) plus ``Id``, ``geneId`` and ``Parent``.
    NOTE(review): presumably one row per GFF3 feature line, with the last
    three columns extracted from the attributes field; confirm against the
    table dump.
    """

    __bind_key__ = "eplant2"
    __tablename__ = "tair10_gff3"

    # NOTE(review): every column is flagged primary_key=True, including the
    # ones declared nullable=True. This looks like the usual workaround for
    # mapping a table that has no real key (the ORM needs some primary key);
    # confirm that the backing table tolerates NULLs in these columns.
    SeqID: db.Mapped[str] = db.mapped_column(db.String(20), nullable=False, primary_key=True)
    Source: db.Mapped[str] = db.mapped_column(db.String(10), nullable=False, primary_key=True)
    Type: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
    # Feature coordinates, stored as integers
    Start: db.Mapped[int] = db.mapped_column(db.Integer, nullable=False, primary_key=True)
    End: db.Mapped[int] = db.mapped_column(db.Integer, nullable=False, primary_key=True)
    Score: db.Mapped[float] = db.mapped_column(db.Float, nullable=True, primary_key=True)
    # Single-character columns
    Strand: db.Mapped[str] = db.mapped_column(db.String(1), nullable=True, primary_key=True)
    Phase: db.Mapped[str] = db.mapped_column(db.String(1), nullable=True, primary_key=True)
    Id: db.Mapped[str] = db.mapped_column(db.String(20), nullable=True, primary_key=True)
    geneId: db.Mapped[str] = db.mapped_column(db.String(20), nullable=True, primary_key=True)
    Parent: db.Mapped[str] = db.mapped_column(db.String(40), nullable=True, primary_key=True)
    Attributes: db.Mapped[str] = db.mapped_column(db.String(256), nullable=True, primary_key=True)
70+
71+
72+
class AgiAlias(db.Model):
    """ORM mapping for the ``agi_alias`` table on the ``eplant2`` bind.

    Each row pairs one ``agi`` value with one ``alias`` value.
    """

    __bind_key__ = "eplant2"
    __tablename__ = "agi_alias"

    # Both columns together form the composite primary key, so the same agi
    # may appear on several rows with different aliases.
    agi: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)
    alias: db.Mapped[str] = db.mapped_column(db.String(30), nullable=False, primary_key=True)

api/resources/gene_information.py

Lines changed: 305 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
from api.models.annotations_lookup import AgiAlias
55
from api.models.eplant2 import Isoforms as EPlant2Isoforms
66
from api.models.eplant2 import Publications as EPlant2Publications
7+
from api.models.eplant2 import TAIR10GFF3 as EPlant2TAIR10_GFF3
8+
from api.models.eplant2 import AgiAlias as EPlant2AgiAlias
9+
from api.models.eplant2 import AgiAnnotation as EPlant2AgiAnnotation
710
from api.models.eplant_poplar import Isoforms as EPlantPoplarIsoforms
811
from api.models.eplant_tomato import Isoforms as EPlantTomatoIsoforms
912
from api.models.eplant_soybean import Isoforms as EPlantSoybeanIsoforms
@@ -14,6 +17,16 @@
1417

1518
gene_information = Namespace("Gene Information", description="Information about Genes", path="/gene_information")
1619

20+
# Request parser for a repeatable "terms" argument holding gene IDs.
# action="append" gathers every supplied value into one list and applies
# `type` to each element, so the element type must be str. The previous
# type=list called list() on each value and split a gene ID such as
# "AT1G01010" into single characters.
parser = gene_information.parser()
parser.add_argument(
    "terms",
    type=str,
    action="append",
    required=True,
    help="Gene IDs, format example: AT1G01010",
    default=["AT1G01020", "AT1G01030"],
)
29+
1730
# I think this is only needed for Swagger UI POST
1831
gene_information_request_fields = gene_information.model(
1932
"GeneInformation",
@@ -27,6 +40,18 @@
2740
},
2841
)
2942

43+
# Request-body model for POST /gene_query (used by Swagger UI).
# It is registered under its own name: models are stored per namespace by
# name, so reusing "GeneInformation" here would collide with the model that
# gene_information_request_fields already registers under that name.
query_genes_request_fields = gene_information.model(
    "GeneQuery",
    {
        "species": fields.String(required=True, example="Arabidopsis_thaliana"),
        "terms": fields.List(
            required=True,
            example=["AT1G01010", "AT1G01020"],
            cls_or_instance=fields.String,
        ),
    },
)
54+
3055

3156
# Validation is done in a different way to keep things simple
3257
class GeneInformationSchema(Schema):
@@ -135,6 +160,286 @@ def get(self, species="", gene_id=""):
135160
return BARUtils.error_exit("There are no data found for the given gene")
136161

137162

163+
@gene_information.route("/genes_by_position/<string:species>/<string:chromosome>/<string:startParam>/<string:endParam>")
class GeneTAIR10_GFF3(Resource):
    # Chromosome names accepted in the URL, mapped to the character that
    # follows "AT" in the gene IDs of that chromosome (e.g. Chr1 -> AT1G...).
    _CHROMOSOME_IDS = {
        "Chr1": "1",
        "Chr2": "2",
        "Chr3": "3",
        "Chr4": "4",
        "Chr5": "5",
        "ChrC": "C",
        "ChrM": "M",
    }

    @gene_information.param("species", _in="path", default="arabidopsis")
    @gene_information.param("chromosome", _in="path", default="0")
    @gene_information.param("startParam", _in="path", default=3000)
    @gene_information.param("endParam", _in="path", default=6000)
    def get(self, species="", chromosome="", startParam="", endParam=""):
        """This end point provides genes given position.

        Returns every row of type "gene" on the requested chromosome whose
        [Start, End] range overlaps [startParam, endParam], each with its
        aliases and annotation. All validation failures return HTTP 400.
        """

        # Check if all parameters are provided
        if not chromosome or not startParam or not endParam:
            return BARUtils.error_exit("Missing parameters"), 400

        # Check that both positions are valid figures *before* comparing them:
        # they arrive as strings, and comparing strings is lexicographic
        # ("999" >= "1000"), which rejected valid ranges.
        if not BARUtils.is_integer(startParam) or not BARUtils.is_integer(endParam):
            return BARUtils.error_exit("At lease one parameter is not valid"), 400
        try:
            start = int(startParam)
            end = int(endParam)
        except ValueError:
            return BARUtils.error_exit("At lease one parameter is not valid"), 400

        # Check if the start param is smaller than end param (numerically)
        if start >= end:
            return BARUtils.error_exit("Start location should be smaller than the end location"), 400

        # Escape the remaining free-text input
        species = escape(species)
        chromosome = escape(chromosome)

        try:
            # Set database
            if species != "arabidopsis":
                return BARUtils.error_exit("No data for the given species"), 400
            database = EPlant2TAIR10_GFF3

            # Map chromosome to its ID
            chromosome_id = self._CHROMOSOME_IDS.get(str(chromosome))
            if chromosome_id is None:
                return BARUtils.error_exit("Invalid chromosome"), 400

            # Genes on this chromosome that start inside the window, end
            # inside it, or span it entirely
            gene_prefix = "AT" + chromosome_id + "G"
            gene_query = db.select(database.geneId, database.Start, database.End, database.Strand).where(
                database.Type == "gene",
                database.geneId.startswith(gene_prefix),
                (
                    database.Start.between(start, end)
                    | database.End.between(start, end)
                    | ((database.Start < start) & (database.End > end))
                ),
            )
            gene_rows = db.session.execute(gene_query).all()
            gene_ids = [row[0] for row in gene_rows]

            # Get aliases, grouped by gene ID
            alias_query = db.select(EPlant2AgiAlias.agi, EPlant2AgiAlias.alias).where(
                EPlant2AgiAlias.agi.in_(gene_ids)
            )
            all_aliases = {}
            for agi, alias in db.session.execute(alias_query).all():
                all_aliases.setdefault(agi, []).append(alias)

            # Get annotation; when the stored text contains "__", the part
            # after the first separator is the one returned
            annotation_query = db.select(EPlant2AgiAnnotation.agi, EPlant2AgiAnnotation.annotation).where(
                EPlant2AgiAnnotation.agi.in_(gene_ids)
            )
            all_annotations = {}
            for agi, annotation in db.session.execute(annotation_query).all():
                parts = annotation.split("__")
                all_annotations[agi.upper()] = parts[1] if len(parts) > 1 else parts[0]

            genes = [
                {
                    "id": row[0],
                    "start": row[1],
                    "end": row[2],
                    "strand": row[3],
                    "aliases": all_aliases.get(row[0], []),
                    "annotation": all_annotations.get(row[0], None),
                }
                for row in gene_rows
            ]
            return BARUtils.success_exit(genes)

        except Exception as e:
            return BARUtils.error_exit(str(e)), 400
266+
267+
268+
@gene_information.route("/gene_query")
class GeneQueryGene(Resource):
    @gene_information.expect(query_genes_request_fields)
    def post(self):
        """This end point provides gene information for multiple genes given multiple terms.

        Expects a JSON object with "species" and "terms" (a list of gene ID
        strings). Returns a mapping of gene ID to its chromosome, start, end,
        strand, aliases and annotation. Invalid input returns HTTP 400.
        """

        # Validate the shape of the body up front so a missing key or a
        # non-object payload is reported as a 400 instead of an unhandled error
        data = request.get_json()
        if not isinstance(data, dict) or "species" not in data or "terms" not in data:
            return BARUtils.error_exit("Missing parameters"), 400
        species = data["species"]
        terms = data["terms"]

        try:
            # Species check
            if species != "Arabidopsis_thaliana":
                return BARUtils.error_exit("No data for the given species"), 400

            # Term check
            if not isinstance(terms, list) or not all(isinstance(one_term, str) for one_term in terms):
                return BARUtils.error_exit("Input list contains invalid term"), 400

            # Normalise case. str.upper() returns a new string, so the result
            # has to be kept (the bare call previously discarded it).
            terms = [one_term.upper() for one_term in terms]

            for one_term in terms:
                if not BARUtils.is_arabidopsis_gene_valid(one_term):
                    return BARUtils.error_exit("Input list contains invalid term"), 400

            # First try to resolve each term through the alias table
            database = EPlant2AgiAlias
            gene_ids = []
            agi_fail = []
            for one_term in terms:
                query = db.select(database.agi).where(database.agi.contains(one_term)).limit(1)
                result = db.session.execute(query).fetchone()
                if not result:
                    agi_fail.append(one_term)
                else:
                    gene_ids.append(result[0])

            # For terms that do not have results, fall back to the GFF3 table
            database = EPlant2TAIR10_GFF3
            for fail_term in agi_fail:
                query = (
                    db.select(database.geneId)
                    .where(
                        ((database.Type == "gene") | (database.Type == "transposable_element_gene")),
                        database.geneId.contains(fail_term),
                    )
                    .limit(1)
                )
                result = db.session.execute(query).fetchone()
                if result:
                    gene_ids.append(result[0])

            # Find information for each term; only the first row per gene ID is kept
            query = db.select(database.geneId, database.Start, database.End, database.Strand).where(
                ((database.Type == "gene") | (database.Type == "transposable_element_gene")),
                database.Source == "TAIR10",
                database.geneId.in_(gene_ids),
            )
            genes_info = {}
            for row in db.session.execute(query).all():
                if row[0] not in genes_info:
                    genes_info[row[0]] = {
                        "id": row[0],
                        # Third character of the gene ID names the chromosome
                        "chromosome": "Chr" + row[0][2:3],
                        "start": row[1],
                        "end": row[2],
                        "strand": row[3],
                        "aliases": [],
                        "annotation": None,
                    }

            # Get aliases
            database = EPlant2AgiAlias
            query = db.select(database.agi, database.alias).where(database.agi.in_(gene_ids))
            for row in db.session.execute(query).all():
                if row[0] in genes_info:
                    genes_info[row[0]]["aliases"].append(row[1])

            # Get annotations; when the stored text contains "__", the part
            # after the first separator is the one returned
            database = EPlant2AgiAnnotation
            query = db.select(database.agi, database.annotation).where(database.agi.in_(gene_ids))
            for row in db.session.execute(query):
                gene_key = row[0].upper()
                if gene_key in genes_info:
                    parts = row[1].split("__")
                    genes_info[gene_key]["annotation"] = parts[1] if len(parts) > 1 else parts[0]

            return BARUtils.success_exit(genes_info)

        except Exception as e:
            return BARUtils.error_exit(str(e)), 400
360+
361+
362+
@gene_information.route("/single_gene_query/<string:species>/<string:term>")
class SingleGeneQueryGene(Resource):
    @gene_information.param("species", _in="path", default="Arabidopsis_thaliana")
    @gene_information.param("term", _in="path", default="AT1G01010")
    def get(self, species="", term=""):
        """This end point provides gene information for a single gene given one term.

        Returns a mapping with at most one entry (gene ID -> chromosome,
        start, end, strand, aliases, annotation). The mapping is empty when
        the term is unknown or has no TAIR10 gene record.
        """

        # Escape input
        species = escape(species)
        term = escape(term).upper()

        try:
            # Species check
            if species != "Arabidopsis_thaliana":
                return BARUtils.error_exit("No data for the given species"), 400

            # Term check
            if not BARUtils.is_arabidopsis_gene_valid(term):
                return BARUtils.error_exit("Input term invalid"), 400

            # Is the term known at all? Try the alias table, then the GFF3 table
            database = EPlant2AgiAlias
            query = db.select(database.agi).where(database.agi == term).limit(1)
            result = db.session.execute(query).fetchone()

            if not result:
                database = EPlant2TAIR10_GFF3
                query = (
                    db.select(database.geneId)
                    .where(
                        ((database.Type == "gene") | (database.Type == "transposable_element_gene")),
                        database.geneId == term,
                    )
                    .limit(1)
                )
                result = db.session.execute(query).fetchone()

            genes_info = {}
            if not result:
                return BARUtils.success_exit(genes_info)

            # Find information for the term
            database = EPlant2TAIR10_GFF3
            query = db.select(database.geneId, database.Start, database.End, database.Strand).where(
                ((database.Type == "gene") | (database.Type == "transposable_element_gene")),
                database.Source == "TAIR10",
                database.geneId == term,
            )
            result = db.session.execute(query).fetchone()

            # The existence check above does not filter on Source == "TAIR10",
            # so this stricter query can still come back empty. Report that as
            # "no data" instead of failing on result[0].
            if result is None:
                return BARUtils.success_exit(genes_info)

            gene = {
                "id": result[0],
                # Third character of the gene ID names the chromosome
                "chromosome": "Chr" + result[0][2:3],
                "start": result[1],
                "end": result[2],
                "strand": result[3],
                "aliases": [],
                "annotation": None,
            }
            genes_info[result[0]] = gene

            # Get aliases, skipping duplicates
            database = EPlant2AgiAlias
            query = db.select(database.agi, database.alias).where(database.agi == term)
            for row in db.session.execute(query).all():
                if row[1] not in gene["aliases"]:
                    gene["aliases"].append(row[1])

            # Get annotations; when the stored text contains "__", the part
            # after the first separator is the one returned
            database = EPlant2AgiAnnotation
            query = db.select(database.agi, database.annotation).where(database.agi == term)
            for row in db.session.execute(query).all():
                parts = row[1].split("__")
                gene["annotation"] = parts[1] if len(parts) > 1 else parts[0]

            return BARUtils.success_exit(genes_info)

        except Exception as e:
            return BARUtils.error_exit(str(e)), 400
441+
442+
138443
@gene_information.route("/gene_isoforms/<string:species>/<string:gene_id>")
139444
class GeneIsoforms(Resource):
140445
@gene_information.param("species", _in="path", default="arabidopsis")

0 commit comments

Comments
 (0)