|
1 | | -import requests |
2 | | -import json |
3 | | -import os |
4 | | -from typing import List, Optional |
5 | | -from collections import OrderedDict |
6 | 1 | from flask_restx import Namespace, Resource |
7 | | -from flask import request, current_app |
8 | 2 | from markupsafe import escape |
9 | | -from sqlalchemy.engine.url import make_url |
10 | | -from sqlalchemy.exc import SQLAlchemyError |
11 | 3 |
|
12 | | -from api.models.efp_dynamic import SIMPLE_EFP_SAMPLE_MODELS |
13 | | -from api.models.efp_schemas import SIMPLE_EFP_DATABASE_SCHEMAS |
14 | | -from api.services.efp_bootstrap import bootstrap_simple_efp_databases |
15 | 4 | from api.services.efp_data import query_efp_database_dynamic |
16 | 5 |
|
17 | | -# efp proxy namespace provides two endpoints for gene expression data |
18 | | -# 1. /values talks to the live bar eplant cgi |
19 | | -# 2. /expression reads from our local or remote databases using one shared query |
# Namespace for all eFP proxy endpoints, mounted under /efp_proxy.
efp_proxy_ns = Namespace(
    'efp Proxy',
    description='Gene expression data from BAR eFP databases',
    path='/efp_proxy',
)
25 | 11 |
|
26 | 12 |
|
27 | | -# rest endpoint that proxies requests to the external bar eplant api |
28 | | -# supports urls like /efp_proxy/values/atgenexp_stress/AT1G01010 |
@efp_proxy_ns.route("/values/<string:database>/<string:gene_id>")
@efp_proxy_ns.doc(
    description="Proxies requests to BAR ePlant API: /efp_proxy/values/{database}/{gene_id}",
    params=OrderedDict([
        (
            "database",
            {
                "description": "Database/datasource for arabidopsis view (e.g., atgenexp_stress)",
                "in": "path",
                "default": "atgenexp_stress",
            },
        ),
        (
            "gene_id",
            {
                "description": "Gene ID to query (e.g., AT1G01010)",
                "in": "path",
                "default": "AT1G01010",
            },
        ),
        (
            "samples",
            {
                "description": "Optional list of sample IDs (repeat ?samples=SampleA&samples=SampleB); omit to fetch all samples. Legacy JSON arrays are still accepted.",
                "in": "query",
                "default": "",
            },
        ),
    ]),
)
class EFPValues(Resource):
    """Proxy resource forwarding expression queries to the live BAR ePlant CGI."""

    # upper bound (seconds) on every outbound HTTP call so a stalled
    # upstream server cannot hang a request worker indefinitely
    REQUEST_TIMEOUT = 30

    @staticmethod
    def parse_samples_query_values(raw_values: Optional[List[str]]) -> Optional[List[str]]:
        """Normalize optional samples from the query string so legacy formats still work.

        Accepts repeated ?samples= values, a single JSON-array string, or a
        comma-separated string. Returns a list of sample IDs, or None when
        nothing usable was supplied.
        """
        if not raw_values:
            return None

        filtered = [value for value in raw_values if value]
        if not filtered:
            return None

        # multiple query values means the modern repeated-parameter form
        if len(filtered) > 1:
            return filtered

        candidate = filtered[0].strip()
        if not candidate:
            return None

        # interpret json array strings because legacy clients sent one json string value
        if candidate.startswith("[") and candidate.endswith("]"):
            try:
                parsed = json.loads(candidate)
                if isinstance(parsed, list):
                    return [str(item).strip() for item in parsed if isinstance(item, str) and item.strip()]
            except json.JSONDecodeError:
                pass

        if "," in candidate:
            # support comma-separated lists by splitting manually
            split_values = [item.strip() for item in candidate.split(",") if item.strip()]
            if split_values:
                return split_values

        return [candidate]

    @staticmethod
    def get_all_samples_for_view(datasource: str):
        """Load all available samples for a datasource using the metadata json and fallbacks.

        Returns a sorted, de-duplicated list of sample IDs; an empty list
        means the caller should let the CGI pick its own defaults.
        """
        # point at the scraped metadata json so tests resolve it from the repo
        path = os.path.join(os.getcwd(), "data/efp_info/efp_species_view_info.json")

        # check for datasources that need hardcoded samples
        if datasource == "root_Schaefer_lab":
            # this dataset is missing from the scraped metadata so we pin a curated set
            print("[info] using hardcoded fallback samples for root_Schaefer_lab")
            return ["WTCHG_203594_01", "WTCHG_203594_05", "WTCHG_203839_04", "WTCHG_203594_03", "WTCHG_203594_07", "WTCHG_203839_06", "WTCHG_203839_01", "WTCHG_203594_10", "WTCHG_203839_08", "WTCHG_129187_01", "WTCHG_129189_01", "WTCHG_129190_01", "WTCHG_129187_03", "WTCHG_129189_03", "WTCHG_129190_03", "WTCHG_129187_05", "WTCHG_129189_05", "WTCHG_129187_07", "WTCHG_131167_01", "WTCHG_125416_01", "WTCHG_129190_05", "WTCHG_131167_03", "WTCHG_125416_03", "WTCHG_129190_07", "WTCHG_131167_05", "WTCHG_125416_05", "WTCHG_129189_07"]

        if datasource == "atgenexp_stress":
            # atgenexp stress views still rely on the json metadata so we keep a minimal fallback
            print("[info] using fallback arabidopsis samples from json spec")
            return ["AtGen_6_0011", "AtGen_6_0012", "AtGen_6_0021", "AtGen_6_0022",
                    "AtGen_6_0711", "AtGen_6_0712", "AtGen_6_0721", "AtGen_6_0722"]

        # check if metadata json file exists
        if not os.path.exists(path):
            # repo clones without fixtures can still run, just without auto-sample loading
            print(f"[warn] metadata json not found at {path}")
            return []

        # try to load and parse the json metadata file
        try:
            with open(path, "r") as f:
                metadata = json.load(f)
        except Exception as e:
            print(f"[error] unable to read json: {e}")
            return []

        # search through all species and views to find a matching datasource
        # (species keys are unused, so iterate values only)
        for obj in metadata.values():
            views = obj.get("data", {}).get("views", {})
            for vname, vinfo in views.items():
                if vinfo.get("database") == datasource:
                    # collect all unique samples from all treatment groups
                    samples = []
                    for group in vinfo.get("groups", {}).values():
                        # each group stores multiple treatment buckets; flatten all of them
                        for treatment_samples in group.get("treatments", {}).values():
                            samples.extend(treatment_samples)
                    print(f"[info] found {len(samples)} samples in json for {datasource}")
                    return sorted(set(samples))

        print(f"[warn] datasource {datasource} not found in json")
        return []

    @staticmethod
    def fetch_efp_data(datasource, gene_id, samples=None):
        """Fetch gene expression data from the external bar eplant api.

        Either use the samples provided or auto-fill the list before calling
        the cgi. Returns a response dict, optionally paired with an HTTP
        status code on failure.
        """
        # set up the external bar api url and basic query parameters
        # (fixed: the path previously contained an accidental double slash)
        base_url = "https://bar.utoronto.ca/eplant/cgi-bin/plantefp.cgi"
        query_params = [
            ("datasource", datasource),
            ("id", gene_id),
            ("format", "json"),
        ]
        samples_applied = False  # track whether we hinted the cgi with explicit samples

        # handle optional sample filtering and expect a normalized list of sample ids
        if samples:
            cleaned_samples = [sample.strip() for sample in samples if isinstance(sample, str) and sample.strip()]
            if cleaned_samples:
                query_params.append(("samples", json.dumps(cleaned_samples)))
                samples_applied = True
        else:
            # no samples provided, so try to auto-load all samples for this datasource
            samples = EFPValues.get_all_samples_for_view(datasource)
            if samples:
                print(f"[info] auto-loaded {len(samples)} samples for datasource {datasource}")
                query_params.append(("samples", json.dumps(samples)))
                samples_applied = True
            else:
                # no metadata entry means the cgi decides which default samples to use
                print(f"[warn] no samples found for datasource {datasource}")

        # make exactly one http get request to the bar eplant cgi with every
        # sample packed in; the timeout keeps a dead upstream from hanging us
        try:
            response = requests.get(base_url, params=query_params, timeout=EFPValues.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # network-level failure (dns, refused connection, timeout)
            return {"success": False, "error": f"request to bar failed: {exc}"}, 502
        url_called = response.url

        # check if the request failed with an http error code
        if not response.ok:
            # propagate error status so clients see the same http code the cgi returned
            return {"success": False, "error": f"bar returned {response.status_code} for url {url_called}"}, response.status_code

        # attempt to parse json response and extract the data array
        try:
            data = response.json()
            if isinstance(data, dict) and "data" in data:
                data = data["data"]
        except Exception:
            # remote endpoint occasionally emits html error pages so treat them as no data
            data = []

        # if no results returned with samples, retry once without sample filtering
        if (not data or data == []) and samples_applied:
            retry_params = [
                ("datasource", datasource),
                ("id", gene_id),
                ("format", "json"),
            ]
            # even if this second call fails, we still return an empty array to the caller
            try:
                retry_resp = requests.get(base_url, params=retry_params, timeout=EFPValues.REQUEST_TIMEOUT)
                retry_data = retry_resp.json()
                if isinstance(retry_data, dict) and "data" in retry_data:
                    retry_data = retry_data["data"]
            except Exception:
                # treat malformed fallback responses as empty to keep behavior predictable
                retry_data = []

            return {
                "success": True,
                "url_called": url_called,
                # guard with isinstance for consistency with the primary path
                "record_count": len(retry_data) if isinstance(retry_data, list) else 0,
                "data": retry_data,
                "note": "no data returned with samples; fetched full view instead."
            }
        return {
            "success": True,
            "url_called": url_called,
            "record_count": len(data) if isinstance(data, list) else 0,
            "data": data  # payload mirrors what the real cgi would have returned
        }

    def get(self, database, gene_id):
        """Handle GET /efp_proxy/values/{database}/{gene_id}."""
        # sanitize path parameters to prevent injection attacks
        database = escape(database)
        gene_id = escape(gene_id)

        # parse ?samples= query args once so downstream logic gets a normalized list
        samples_arg = self.parse_samples_query_values(request.args.getlist("samples"))

        # delegate to fetch_efp_data which auto-loads samples when none provided
        return self.fetch_efp_data(database, gene_id, samples=samples_arg)
234 | | - |
235 | | - |
236 | | -# rest endpoint that uses the static schema catalog to query local sqlite databases |
237 | | -# supports urls like /efp_proxy/expression/sample_data/261585_at |
238 | 13 | @efp_proxy_ns.route("/expression/<string:database>/<string:gene_id>") |
239 | 14 | @efp_proxy_ns.doc( |
240 | | - description="Static eFP endpoint: /efp_proxy/expression/{database}/{gene_id}" |
| 15 | + description="Retrieve gene expression values from a specified eFP database." |
241 | 16 | ) |
242 | 17 | @efp_proxy_ns.param( |
243 | 18 | "gene_id", |
@@ -268,83 +43,4 @@ def get(self, database, gene_id): |
268 | 43 | return result, result.get("error_code", 500) |
269 | 44 |
|
270 | 45 |
|
@efp_proxy_ns.route("/bootstrap/simple")
@efp_proxy_ns.doc(
    description="Create or update the simple eFP MySQL databases using the in-memory schema definitions.",
    params={
        "host": "Optional MySQL hostname override (JSON body field). Defaults to the host defined in SQLALCHEMY_BINDS.",
        "port": "Optional MySQL port override (JSON body field). Defaults to the port defined in SQLALCHEMY_BINDS.",
        "user": "Optional MySQL username override (JSON body field). Defaults to the username defined in SQLALCHEMY_BINDS.",
        "password": "Optional MySQL password override (JSON body field). Defaults to the password defined in SQLALCHEMY_BINDS.",
        "databases": "Optional list of database names to bootstrap (JSON body field). Defaults to every simple database.",
    },
)
class EFPSimpleBootstrap(Resource):
    """Bootstrap endpoint that (re)creates the simple eFP MySQL databases."""

    @staticmethod
    def _infer_default_db_credentials():
        """Derive MySQL connection info for the simple eFP datasets from the configured binds.

        Returns:
            dict: host/port/user/password taken from the first configured
            bind whose key matches a simple eFP schema name.

        Raises:
            ValueError: when none of the simple eFP databases has a bind.
        """
        binds = current_app.config.get("SQLALCHEMY_BINDS") or {}
        for db_name in SIMPLE_EFP_DATABASE_SCHEMAS.keys():
            uri = binds.get(db_name)
            if not uri:
                continue
            url = make_url(uri)
            return {
                "host": url.host or "localhost",
                "port": url.port or 3306,
                "user": url.username or "root",
                "password": url.password or "",
            }
        raise ValueError("No SQLAlchemy bind configured for the simple eFP databases.")

    def post(self):
        """Run the bootstrap, honoring optional overrides from the JSON body."""
        payload = request.get_json(silent=True) or {}
        try:
            defaults = self._infer_default_db_credentials()
        except ValueError as exc:
            return {"success": False, "error": str(exc)}, 500

        host = payload.get("host") or defaults["host"]
        try:
            port_value = payload.get("port")
            port = int(port_value) if port_value is not None else int(defaults["port"])
        except (TypeError, ValueError):
            return {"success": False, "error": "port must be an integer"}, 400
        user = payload.get("user") or defaults["user"]
        # explicit None check (not `or`) so a deliberately empty password ""
        # in the payload still overrides a non-empty default from the bind URI
        password = payload.get("password")
        if password is None:
            password = defaults["password"]

        databases = payload.get("databases")
        if databases is not None:
            if not isinstance(databases, list) or not all(isinstance(item, str) for item in databases):
                return {"success": False, "error": "databases must be a list of names."}, 400

        try:
            results = bootstrap_simple_efp_databases(
                host=host,
                port=port,
                user=user,
                password=password,
                databases=databases,
            )
        except ValueError as exc:
            # invalid database selection or schema mismatch
            return {"success": False, "error": str(exc)}, 400
        except SQLAlchemyError as exc:
            # database-level failure while materializing the schemas
            return {"success": False, "error": str(exc)}, 500

        # report which dynamic models correspond to the bootstrapped databases
        model_info = [
            {"database": name, "model": model.__name__}
            for name, model in SIMPLE_EFP_SAMPLE_MODELS.items()
            if databases is None or name in databases
        ]

        return {
            "success": True,
            "databases": results,
            "models": model_info,
            "note": "Simple eFP databases are materialized in MySQL while SQLAlchemy models remain dynamic.",
        }, 200
346 | | - |
347 | | - |
# explicit registrations keep every route path visible in one place
efp_proxy_ns.add_resource(EFPValues, '/values/<string:database>/<string:gene_id>')
efp_proxy_ns.add_resource(EFPExpression, '/expression/<string:database>/<string:gene_id>')
efp_proxy_ns.add_resource(EFPSimpleBootstrap, '/bootstrap/simple')
0 commit comments