Skip to content

Commit 722ef52

Browse files
committed
feat: refactor server capacity handling and streamline term fetching process
1 parent 8e606d0 commit 722ef52

1 file changed

Lines changed: 84 additions & 79 deletions

File tree

vfbterms.py

Lines changed: 84 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,23 @@
11
#!/usr/bin/env python3
22

33
import sys
4-
from os import listdir, chdir
5-
from os.path import isfile, join
4+
from os import chdir
5+
from os.path import join
66
import os
77
import warnings
88
import re
99
import json
10-
import datetime
1110
import traceback
1211
import time
1312
from urllib.parse import quote
1413

1514
# Suppress the urllib3 warning about OpenSSL
1615
warnings.filterwarnings('ignore', category=Warning)
1716

17+
# Ensure progress logs appear promptly in non-interactive runners (e.g. Jenkins)
18+
if hasattr(sys.stdout, "reconfigure"):
19+
sys.stdout.reconfigure(line_buffering=True)
20+
1821
# Set environment variable to skip GUI dependencies
1922
os.environ['VFB_SKIP_GUI'] = '1'
2023

@@ -30,8 +33,9 @@
3033

3134
# Throttle settings — stay under 20 concurrent to keep API reliable
3235
MAX_ACTIVE_BEFORE_BACKOFF = 20 # Back off when this many queries are active
33-
BACKOFF_SECONDS = 120 # How long to wait when server is busy
3436
STATUS_CHECK_INTERVAL = 10 # Seconds between status checks while waiting
37+
MAX_CAPACITY_WAIT_SECONDS = 600 # Give up waiting for capacity after this long
38+
API_TIMEOUT_SECONDS = int(os.environ.get("VFB_API_TIMEOUT_SECONDS", "900"))
3539

3640
# Known ID prefixes for internal link conversion
3741
KNOWN_PREFIXES = (
@@ -74,29 +78,50 @@ def check_server_status():
7478
print(f"WARNING: Could not check server status: {e}")
7579
return None
7680

77-
def wait_for_server_capacity():
81+
def wait_for_server_capacity(term_id=""):
7882
"""Block until the server has capacity below our threshold.
7983
8084
Monitors the /status endpoint and waits when active queries >= MAX_ACTIVE_BEFORE_BACKOFF
81-
or when any queries are waiting in the queue. This is a low-priority process
82-
and should not flood the API servers.
85+
and retries when the status endpoint is unavailable. Waits are bounded so a
86+
single term cannot block forever.
8387
"""
88+
start_time = time.time()
89+
term_label = term_id or "request"
8490
while True:
8591
status = check_server_status()
92+
elapsed = time.time() - start_time
93+
8694
if status is None:
87-
# Can't reach status endpoint — back off to be safe
88-
print(f" Status endpoint unreachable, backing off {BACKOFF_SECONDS}s...")
89-
time.sleep(BACKOFF_SECONDS)
95+
if elapsed >= MAX_CAPACITY_WAIT_SECONDS:
96+
print(
97+
f"WARNING: Proceeding with {term_label} after {int(elapsed)}s because status checks failed."
98+
)
99+
return
100+
print(f" Status endpoint unreachable, retrying in {STATUS_CHECK_INTERVAL}s...")
101+
time.sleep(STATUS_CHECK_INTERVAL)
90102
continue
91103

92104
active, waiting = status
93-
if waiting > 0 or active >= MAX_ACTIVE_BEFORE_BACKOFF:
94-
print(f" Server busy: {active} active, {waiting} queued. "
95-
f"Backing off {BACKOFF_SECONDS}s... (threshold: {MAX_ACTIVE_BEFORE_BACKOFF})")
96-
time.sleep(BACKOFF_SECONDS)
105+
if active >= MAX_ACTIVE_BEFORE_BACKOFF:
106+
if elapsed >= MAX_CAPACITY_WAIT_SECONDS:
107+
print(
108+
f"WARNING: Proceeding with {term_label} after {int(elapsed)}s "
109+
f"while busy (active={active}, waiting={waiting})."
110+
)
111+
return
112+
print(
113+
f" Server busy: {active} active, {waiting} queued. "
114+
f"Retrying in {STATUS_CHECK_INTERVAL}s... (threshold: {MAX_ACTIVE_BEFORE_BACKOFF})"
115+
)
116+
time.sleep(STATUS_CHECK_INTERVAL)
97117
continue
98118

99-
# Server has capacity
119+
if waiting > 0:
120+
print(
121+
f" Server queue detected ({waiting}) but active load is {active}; proceeding."
122+
)
123+
124+
# Server has usable capacity
100125
return
101126

102127
# ─── Data Fetching ───────────────────────────────────────────────────────────
@@ -107,10 +132,10 @@ def fetch_term_info(term_id):
107132
Checks server capacity before making the request to avoid flooding.
108133
"""
109134
# Wait until the server isn't overloaded
110-
wait_for_server_capacity()
135+
wait_for_server_capacity(term_id)
111136

112137
try:
113-
resp = session.get(API_BASE, params={"id": term_id}, timeout=9000)
138+
resp = session.get(API_BASE, params={"id": term_id}, timeout=API_TIMEOUT_SECONDS)
114139
resp.raise_for_status()
115140
data = resp.json()
116141
if not data or not data.get("Id"):
@@ -570,6 +595,21 @@ def get_vfb_connect():
570595
_vc = VfbConnect()
571596
return _vc
572597

598+
def fetch_ids(label, query):
    """Run a single query and return the ID list it collects.

    Prints one progress line before the query is sent and one line with
    the number of IDs afterwards.

    Args:
        label: Human-readable name of the group, used only in log output.
        query: Query string passed (as a one-element list) to
            ``commit_list`` on the shared VFB connection.

    Returns:
        The value found in the first column of the first row of the
        first result, which the callers treat as a list of IDs.
    """
    print(f"Fetching ID list for {label}...")
    connection = get_vfb_connect()
    results = connection.nc.commit_list([query])
    # The ID list sits in the first column of the first row of the first result.
    first_row = results[0]['data'][0]['row']
    id_list = first_row[0]
    print(f" Retrieved {len(id_list)} IDs for {label}")
    return id_list
605+
606+
def process_group(base_path, relative_dir, label, query):
    """Generate term pages for one group inside its own output directory.

    Args:
        base_path: Root directory that the group directories hang off.
        relative_dir: Directory of this group relative to ``base_path``
            (may contain ``..`` segments; the joined path is normalised).
        label: Human-readable group name used in log output.
        query: Query string that yields the group's ID list.
    """
    joined = join(base_path, relative_dir)
    destination = os.path.normpath(joined)
    print(f"\n[{label}] {destination}")
    # Switch the working directory to the group's directory before saving.
    chdir(destination)
    group_ids = fetch_ids(label, query)
    save_terms(group_ids)
612+
573613
def save_terms(ids):
574614
"""Fetch and save term pages for a list of IDs."""
575615
total = len(ids)
@@ -835,68 +875,33 @@ def test_medulla_page():
835875
if len(sys.argv) > 1:
836876
mypath = sys.argv[1]
837877
print("Updating all files in " + mypath)
838-
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
839-
840-
chdir(mypath + 'fbbt/')
841-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'FBbt' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
842-
chdir(mypath + 'fbbi/')
843-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'FBbi' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
844-
chdir(mypath + 'fbcv/')
845-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'FBcv' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
846-
chdir(mypath + 'fbdv/')
847-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'FBdv' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
848-
chdir(mypath + 'vfb/')
849-
vfb = get_vfb_connect().nc.commit_list(["MATCH (n:Individual) WHERE n.short_form starts with 'VFB_' AND NOT n.short_form starts with 'VFB_internal' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0]
850-
save_terms(vfb)
851-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'VFBexp_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
852-
853-
chdir(mypath + '../datasets/')
854-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:DataSet) with n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
855-
856-
chdir(mypath + 'go/')
857-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'GO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
858-
859-
chdir(mypath + 'so/')
860-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'SO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
861-
862-
chdir(mypath + 'ioa/')
863-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'IAO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
864-
865-
chdir(mypath + 'geno/')
866-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'GENO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
867-
868-
chdir(mypath + 'pato/')
869-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'PATO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
870-
871-
chdir(mypath + 'pco/')
872-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'PCO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
873-
874-
chdir(mypath + 'uberon/')
875-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'UBERON_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
876-
877-
chdir(mypath + 'ro/')
878-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'RO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
879-
880-
chdir(mypath + 'obi/')
881-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'OBI_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
882-
883-
chdir(mypath + 'ncbitaxon/')
884-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'NCBITaxon_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
885-
886-
chdir(mypath + 'zp/')
887-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'ZP_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
888-
889-
chdir(mypath + 'wbphenotype/')
890-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'WBPhenotype_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
891-
892-
chdir(mypath + 'caro/')
893-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'CARO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
894-
895-
chdir(mypath + 'bfo/')
896-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'BFO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
897-
898-
chdir(mypath + 'flybase/')
899-
save_terms(get_vfb_connect().nc.commit_list(["MATCH (n:Class) WHERE n.short_form starts with 'FB' AND NOT n.short_form contains '_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"])[0]['data'][0]['row'][0])
878+
groups = [
879+
('fbbt/', 'FBbt classes', "MATCH (n:Class) WHERE n.short_form starts with 'FBbt' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
880+
('fbbi/', 'FBbi classes', "MATCH (n:Class) WHERE n.short_form starts with 'FBbi' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
881+
('fbcv/', 'FBcv classes', "MATCH (n:Class) WHERE n.short_form starts with 'FBcv' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
882+
('fbdv/', 'FBdv classes', "MATCH (n:Class) WHERE n.short_form starts with 'FBdv' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
883+
('vfb/', 'VFB individuals', "MATCH (n:Individual) WHERE n.short_form starts with 'VFB_' AND NOT n.short_form starts with 'VFB_internal' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
884+
('vfb/', 'VFBexp classes', "MATCH (n:Class) WHERE n.short_form starts with 'VFBexp_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
885+
('../datasets/', 'Datasets', "MATCH (n:DataSet) with n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
886+
('go/', 'GO classes', "MATCH (n:Class) WHERE n.short_form starts with 'GO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
887+
('so/', 'SO classes', "MATCH (n:Class) WHERE n.short_form starts with 'SO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
888+
('ioa/', 'IAO classes', "MATCH (n:Class) WHERE n.short_form starts with 'IAO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
889+
('geno/', 'GENO classes', "MATCH (n:Class) WHERE n.short_form starts with 'GENO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
890+
('pato/', 'PATO classes', "MATCH (n:Class) WHERE n.short_form starts with 'PATO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
891+
('pco/', 'PCO classes', "MATCH (n:Class) WHERE n.short_form starts with 'PCO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
892+
('uberon/', 'UBERON classes', "MATCH (n:Class) WHERE n.short_form starts with 'UBERON_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
893+
('ro/', 'RO classes', "MATCH (n:Class) WHERE n.short_form starts with 'RO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
894+
('obi/', 'OBI classes', "MATCH (n:Class) WHERE n.short_form starts with 'OBI_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
895+
('ncbitaxon/', 'NCBITaxon classes', "MATCH (n:Class) WHERE n.short_form starts with 'NCBITaxon_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
896+
('zp/', 'ZP classes', "MATCH (n:Class) WHERE n.short_form starts with 'ZP_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
897+
('wbphenotype/', 'WBPhenotype classes', "MATCH (n:Class) WHERE n.short_form starts with 'WBPhenotype_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
898+
('caro/', 'CARO classes', "MATCH (n:Class) WHERE n.short_form starts with 'CARO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
899+
('bfo/', 'BFO classes', "MATCH (n:Class) WHERE n.short_form starts with 'BFO_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
900+
('flybase/', 'FlyBase classes', "MATCH (n:Class) WHERE n.short_form starts with 'FB' AND NOT n.short_form contains '_' WITH n.short_form as id ORDER BY id ASC RETURN collect(distinct id) as ids"),
901+
]
902+
903+
for relative_dir, label, query in groups:
904+
process_group(mypath, relative_dir, label, query)
900905

901906
else:
902907
print("Testing term page generation...")

0 commit comments

Comments
 (0)