Skip to content

Commit e2c30e4

Browse files
committed
Merge branch 'feature/es-improvements' into develop
2 parents 36522d7 + 14fa96f commit e2c30e4

29 files changed

Lines changed: 3341 additions & 210 deletions

app/api/v1/endpoint_modules/gazetteer.py

Lines changed: 136 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import logging
23
import os
34
from typing import Optional
@@ -330,10 +331,24 @@ async def search_wof(
330331
q: str = Query(..., description="Search query"),
331332
limit: int = Query(10, description="Maximum number of results", ge=1, le=100),
332333
offset: int = Query(0, description="Number of results to skip", ge=0),
334+
exclude_placetypes: Optional[str] = Query(
335+
None,
336+
description="Comma-separated list of placetypes to exclude (default: microhood,neighbourhood,venue)",
337+
),
333338
request: Request = None,
334339
):
335340
"""Search Who's on First gazetteer."""
336341
try:
342+
# Default placetypes to exclude for autosuggestion
343+
if exclude_placetypes is None:
344+
exclude_placetypes = "localadmin,microhood,neighbourhood,venue"
345+
# Handle case where exclude_placetypes might be a Query object (when called directly in tests)
346+
elif not isinstance(exclude_placetypes, str):
347+
# If it's not a string (e.g., Query object), use default
348+
exclude_placetypes = "localadmin,microhood,neighbourhood,venue"
349+
350+
excluded_types = [pt.strip() for pt in exclude_placetypes.split(",") if pt.strip()]
351+
337352
# Build search query
338353
search_terms = q.split()
339354
conditions = []
@@ -346,6 +361,16 @@ async def search_wof(
346361
)
347362
)
348363

364+
# Exclude confusing placetypes for autosuggestion
365+
# Keep records where placetype is NULL or not in excluded list
366+
if excluded_types:
367+
conditions.append(
368+
or_(
369+
gazetteer_wof_spr.c.placetype.is_(None),
370+
~gazetteer_wof_spr.c.placetype.in_(excluded_types),
371+
)
372+
)
373+
349374
query = (
350375
select(gazetteer_wof_spr)
351376
.where(and_(*conditions))
@@ -356,13 +381,121 @@ async def search_wof(
356381

357382
results = await database.fetch_all(query)
358383

384+
# Collect wok_ids for batch fetching ancestors and geojson
385+
wok_ids = [result["wok_id"] for result in results]
386+
387+
# Fetch ancestors for all results in batch
388+
ancestors_map = {}
389+
if wok_ids:
390+
ancestors_query = select(gazetteer_wof_ancestors).where(
391+
gazetteer_wof_ancestors.c.wok_id.in_(wok_ids)
392+
)
393+
ancestors = await database.fetch_all(ancestors_query)
394+
395+
# Group ancestors by wok_id
396+
for ancestor in ancestors:
397+
wok_id = ancestor["wok_id"]
398+
if wok_id not in ancestors_map:
399+
ancestors_map[wok_id] = []
400+
ancestors_map[wok_id].append(dict(ancestor))
401+
402+
# Fetch ancestor names from spr table
403+
ancestor_ids = list(
404+
set(
405+
[
406+
a["ancestor_id"]
407+
for ancestors_list in ancestors_map.values()
408+
for a in ancestors_list
409+
]
410+
)
411+
)
412+
if ancestor_ids:
413+
# Compare ancestor_id (Integer) with wok_id (BigInteger) - PostgreSQL handles type coercion
414+
ancestor_spr_query = select(gazetteer_wof_spr).where(
415+
gazetteer_wof_spr.c.wok_id.in_(ancestor_ids)
416+
)
417+
ancestor_sprs = await database.fetch_all(ancestor_spr_query)
418+
ancestor_names_map = {spr["wok_id"]: spr["name"] for spr in ancestor_sprs}
419+
420+
# Add names to ancestors
421+
for wok_id, ancestors_list in ancestors_map.items():
422+
for ancestor in ancestors_list:
423+
ancestor["name"] = ancestor_names_map.get(ancestor["ancestor_id"])
424+
425+
# Fetch GeoJSON for all results in batch
426+
geojson_map = {}
427+
if wok_ids:
428+
geojson_query = (
429+
select(gazetteer_wof_geojson)
430+
.where(gazetteer_wof_geojson.c.wok_id.in_(wok_ids))
431+
.order_by(
432+
# Prefer non-alt geometries, then by source preference
433+
gazetteer_wof_geojson.c.is_alt.asc(),
434+
gazetteer_wof_geojson.c.source.asc(),
435+
)
436+
)
437+
geojson_records = await database.fetch_all(geojson_query)
438+
439+
# Group by wok_id, keeping only the first (best) one
440+
for geojson_record in geojson_records:
441+
wok_id = geojson_record["wok_id"]
442+
if wok_id not in geojson_map:
443+
geojson_map[wok_id] = dict(geojson_record)
444+
359445
# Convert results to JSON:API format
360446
data = []
361447
for row in results:
362448
row_dict = dict(row)
449+
wok_id = row_dict.get("wok_id")
450+
451+
# Get ancestors for this place
452+
ancestors = ancestors_map.get(wok_id, [])
453+
454+
# Build hierarchy: prefer region, county, locality for display
455+
hierarchy_parts = []
456+
hierarchy_placetypes = ["region", "county", "locality"]
457+
458+
# Sort ancestors by placetype priority
459+
sorted_ancestors = sorted(
460+
ancestors,
461+
key=lambda a: hierarchy_placetypes.index(a.get("ancestor_placetype", ""))
462+
if a.get("ancestor_placetype") in hierarchy_placetypes
463+
else 999,
464+
)
465+
466+
for ancestor in sorted_ancestors:
467+
if ancestor.get("ancestor_placetype") in hierarchy_placetypes and ancestor.get(
468+
"name"
469+
):
470+
hierarchy_parts.append(ancestor["name"])
471+
472+
# Build display name: "Name, Parent1, Parent2, Country"
473+
display_parts = [row_dict.get("name", "")]
474+
display_parts.extend(hierarchy_parts)
475+
if row_dict.get("country"):
476+
display_parts.append(row_dict["country"])
477+
display_name = ", ".join(display_parts)
478+
479+
# Get GeoJSON for this place
480+
geojson_record = geojson_map.get(wok_id)
481+
geojson_data = None
482+
if geojson_record:
483+
try:
484+
geojson_data = json.loads(geojson_record["body"])
485+
except (json.JSONDecodeError, TypeError):
486+
geojson_data = None
487+
363488
# Sanitize the data for JSON serialization
364489
row_dict = sanitize_for_json(row_dict)
365490

491+
# Sanitize ancestors/hierarchy data (may contain date fields)
492+
sanitized_ancestors = sanitize_for_json(ancestors)
493+
494+
# Add enhanced fields
495+
row_dict["display_name"] = display_name
496+
row_dict["hierarchy"] = sanitized_ancestors
497+
row_dict["geojson"] = geojson_data
498+
366499
# Format as JSON:API resource
367500
formatted_row = {
368501
"id": str(row_dict.get("wok_id", row_dict.get("id", ""))),
@@ -395,5 +528,6 @@ async def search_wof(
395528
return JSONResponse(content=reordered_response)
396529

397530
except Exception as e:
398-
logger.error(f"Error searching WOF: {str(e)}", exc_info=True)
399-
raise HTTPException(status_code=500, detail="Failed to search WOF") from e
531+
error_msg = str(e)
532+
logger.error(f"Error searching WOF: {error_msg}", exc_info=True)
533+
raise HTTPException(status_code=500, detail=f"Failed to search WOF: {error_msg}") from e

app/api/v1/endpoint_modules/resources.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,15 @@ async def get_resource_distributions(
258258
raise HTTPException(status_code=500, detail=str(e)) from e
259259

260260

261+
@router.get("/resources/{id}/links")
262+
async def get_resource_links(
263+
id: str,
264+
callback: Optional[str] = Query(None, description="JSONP callback name"),
265+
):
266+
"""Get all links for a resource."""
267+
return await LinkService.get_resource_links(id)
268+
269+
261270
@router.get("/resources/{id}/ogm")
262271
async def get_resource_ogm(
263272
id: str,
@@ -317,15 +326,6 @@ async def get_resource_ogm(
317326
return JSONResponse(content={"error": str(e)}, status_code=500)
318327

319328

320-
@router.get("/resources/{id}/links")
321-
async def get_resource_links(
322-
id: str,
323-
callback: Optional[str] = Query(None, description="JSONP callback name"),
324-
):
325-
"""Get all links for a resource."""
326-
return await LinkService.get_resource_links(id)
327-
328-
329329
@router.get("/resources/{id}/relationships")
330330
async def get_resource_relationships(
331331
id: str,
@@ -389,8 +389,7 @@ async def get_resource_static_map(
389389
id: str,
390390
request: Request = None,
391391
):
392-
"""Get a static map image for a resource based on its locn_geometry (or dcat_bbox as fallback).
393-
"""
392+
"""Get a static map image for a resource based on its locn_geometry (or dcat_bbox as fallback)."""
394393
try:
395394
# First check if the resource exists and has geometry
396395
async with async_session() as session:
@@ -506,16 +505,13 @@ async def get_resource_thumbnail(
506505
image_service = ImageService(resource_dict, distribution_context=distribution_context)
507506
thumbnail_url = image_service.get_thumbnail_url()
508507

509-
# Ensure placeholder when none available
510-
if not thumbnail_url:
511-
application_url = os.getenv("APPLICATION_URL", "http://localhost:8000").rstrip("/")
512-
thumbnail_url = f"{application_url}/api/v1/thumbnails/placeholder"
513-
514-
is_placeholder = "/api/v1/thumbnails/placeholder" in thumbnail_url
508+
# Determine if it's a placeholder (only true if URL is actually a placeholder)
509+
# If thumbnail_url is None, placeholder should be false (frontend uses resource class icon)
510+
is_placeholder = bool(thumbnail_url and "/thumbnails/placeholder" in str(thumbnail_url))
515511

516512
response_payload = {
517513
"id": id,
518-
"thumbnail_url": thumbnail_url,
514+
"thumbnail_url": thumbnail_url, # Can be None (use resource class), placeholder URL, or actual thumbnail URL
519515
"placeholder": is_placeholder,
520516
}
521517

@@ -529,17 +525,23 @@ async def get_resource_thumbnail(
529525
image_service._standardize_iiif_url(source_url) if source_url else None
530526
)
531527

532-
# If manifest, try to resolve to an image (network call; 2s timeout inside service)
528+
# If manifest, try to resolve from cache only (no blocking HTTP calls)
533529
resolved_url = None
534530
if (
535531
source_url
536532
and isinstance(source_url, str)
537-
and source_url.endswith(("/manifest", "manifest.json"))
533+
and image_service._is_manifest_url(source_url)
538534
):
539535
try:
540-
resolved_url = image_service.get_iiif_manifest_thumbnail(source_url)
541-
if resolved_url:
542-
resolved_url = image_service._standardize_iiif_url(resolved_url)
536+
# Only check cache - don't fetch manifests synchronously
537+
manifest_cache_key = f"manifest:{source_url}"
538+
cached_manifest_data = image_service.cache.get(manifest_cache_key)
539+
if cached_manifest_data:
540+
import json
541+
manifest_json = json.loads(cached_manifest_data)
542+
resolved_url = image_service._extract_thumbnail_from_manifest_json(manifest_json, source_url)
543+
if resolved_url:
544+
resolved_url = image_service._standardize_iiif_url(resolved_url)
543545
except Exception:
544546
resolved_url = None
545547

app/api/v1/endpoint_modules/search.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import logging
33
from typing import Annotated, Optional
44

5+
logger = logging.getLogger(__name__)
6+
57
from fastapi import APIRouter, Body, HTTPException, Query, Request
68
from fastapi.responses import JSONResponse
79
from sqlalchemy import select
@@ -13,6 +15,7 @@
1315
from app.api.v1.utils import (
1416
create_jsonapi_response,
1517
create_pagination_links,
18+
filter_empty_values,
1619
process_resource_optimized,
1720
sanitize_for_json,
1821
)
@@ -92,15 +95,23 @@ async def _handle_search(request: Request, params: dict) -> JSONResponse:
9295
for item in sanitized_results.get("data", []):
9396
rid = None
9497
score = None
98+
overlap = None
9599
if isinstance(item, dict):
96100
rid = (
97101
item.get("id")
98102
or item.get("attributes", {}).get("id")
99103
or item.get("attributes", {}).get("attributes", {}).get("id")
100104
)
101-
score = item.get("score") or item.get("attributes", {}).get("score")
105+
# Be careful not to treat 0.0 as falsy; only fall back on None
106+
score = item.get("score")
107+
if score is None:
108+
score = item.get("attributes", {}).get("score")
109+
110+
overlap = item.get("bbox_overlap_ratio")
111+
if overlap is None:
112+
overlap = item.get("attributes", {}).get("bbox_overlap_ratio")
102113
if rid:
103-
resource_data.append({"id": rid, "score": score})
114+
resource_data.append({"id": rid, "score": score, "bbox_overlap_ratio": overlap})
104115

105116
# Step 3: Batch fetch resource data
106117
async with async_session() as session:
@@ -117,6 +128,14 @@ async def _handle_search(request: Request, params: dict) -> JSONResponse:
117128
d = lookup.get(rd["id"]) or {}
118129
obj = await process_resource_optimized(d, {}, apply_field_mapping=False)
119130
mapped_attrs = OGMFieldMapper.map_resource_fields(obj.get("attributes", {}))
131+
# Attach ES scoring and overlap ratio info into per-resource meta for debugging
132+
if rd.get("score") is not None:
133+
obj.setdefault("meta", {})
134+
obj["meta"]["score"] = rd["score"]
135+
if rd.get("bbox_overlap_ratio") is not None:
136+
obj.setdefault("meta", {})
137+
obj["meta"]["bbox_overlap_ratio"] = rd["bbox_overlap_ratio"]
138+
120139
if isinstance(fields, str) and fields.strip():
121140
requested = [f.strip() for f in fields.split(",") if f.strip()]
122141
if "id" not in requested:
@@ -126,6 +145,9 @@ async def _handle_search(request: Request, params: dict) -> JSONResponse:
126145
obj["attributes"] = filtered_attrs
127146
else:
128147
obj["attributes"] = mapped_attrs
148+
149+
# Filter out empty arrays and empty strings from attributes
150+
obj["attributes"] = filter_empty_values(obj["attributes"])
129151
if not meta and "meta" in obj:
130152
obj.pop("meta", None)
131153
processed_resources.append(obj)
@@ -216,6 +238,21 @@ async def search(
216238
status_code=400,
217239
)
218240

241+
# Get the raw query string from the request scope before FastAPI parses it
242+
# This preserves bracket notation that FastAPI might filter from request.url.query
243+
raw_query_string = (
244+
request.scope.get("query_string", b"").decode("utf-8")
245+
if isinstance(request.scope.get("query_string"), bytes)
246+
else request.scope.get("query_string", "")
247+
)
248+
query_string = (
249+
raw_query_string
250+
if raw_query_string
251+
else (request.url.query if request.url.query else "")
252+
)
253+
logger.info(
254+
f"Search GET: raw_query_string length={len(raw_query_string)}, query_string length={len(query_string)}, sample={query_string[:300]}"
255+
)
219256
return await _handle_search(
220257
request,
221258
{
@@ -228,7 +265,7 @@ async def search(
228265
"facets": facets,
229266
"meta": meta,
230267
"callback": callback,
231-
"request_query_params": str(request.query_params),
268+
"request_query_params": query_string,
232269
"adv_q": parsed_adv_q,
233270
},
234271
)

0 commit comments

Comments
 (0)