Skip to content

Commit e5f1cec

Browse files
authored
Merge pull request #1 from OMOPHub/semantic-search
Semantic search support
2 parents e682c2f + 3b571e1 commit e5f1cec

8 files changed

Lines changed: 1068 additions & 6 deletions

File tree

.github/FUNDING.yml

Lines changed: 0 additions & 3 deletions
This file was deleted.

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
.Ruserdata
77
*.Rcheck/
88
*.tar.gz
9+
dump.rdb
910

1011
# Build artifacts
1112
docs/

R/request.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ build_request <- function(base_url, api_key, timeout = 30, max_retries = 3,
2020
"Content-Type" = "application/json",
2121
"Accept" = "application/json"
2222
) |>
23-
httr2::req_user_agent("OMOPHub-SDK-R/1.0.0") |>
23+
httr2::req_user_agent(paste0("OMOPHub-SDK-R/", utils::packageVersion("omophub"))) |>
2424
httr2::req_timeout(timeout) |>
2525
httr2::req_throttle(
2626
rate = .omophub_env$rate_limit_capacity / .omophub_env$rate_limit_fill_time

R/search.R

Lines changed: 194 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ SearchResource <- R6::R6Class(
161161
page = 1,
162162
page_size = 20) {
163163
checkmate::assert_string(query, min.chars = 1)
164+
checkmate::assert_integerish(page, lower = 1, len = 1, any.missing = FALSE)
165+
checkmate::assert_integerish(page_size, lower = 1, upper = 1000, len = 1, any.missing = FALSE)
164166

165167
body <- list(query = query)
166168

@@ -206,6 +208,7 @@ SearchResource <- R6::R6Class(
206208
domains = NULL,
207209
max_suggestions = 10) {
208210
checkmate::assert_string(query, min.chars = 1)
211+
checkmate::assert_integerish(max_suggestions, lower = 1, len = 1, any.missing = FALSE)
209212

210213
params <- list(
211214
query = query,
@@ -222,11 +225,201 @@ SearchResource <- R6::R6Class(
222225
perform_get(private$.base_req, "search/suggest", query = params)
223226
},
224227

228+
#' @description
#' Search concepts by meaning rather than exact text, using neural
#' embeddings (GET `concepts/semantic-search`).
#'
#' @param query Non-empty natural language search query (required).
#' @param vocabulary_ids Optional vocabulary IDs to restrict results to.
#' @param domain_ids Optional domain IDs to restrict results to.
#' @param standard_concept Optional standard concept flag ('S' or 'C').
#' @param concept_class_id Optional concept class ID to restrict results to.
#' @param threshold Optional minimum similarity threshold (0.0-1.0). Left
#'   out of the request when `NULL`; the API default is documented as 0.5.
#' @param page Page number (1-based). Default 1.
#' @param page_size Results per page (max 100). Default 20.
#'
#' @returns List with results and pagination metadata.
semantic = function(query,
                    vocabulary_ids = NULL,
                    domain_ids = NULL,
                    standard_concept = NULL,
                    concept_class_id = NULL,
                    threshold = NULL,
                    page = 1,
                    page_size = 20) {
  checkmate::assert_string(query, min.chars = 1)
  paging <- validate_pagination(page, page_size, max_page_size = 100)

  # Mandatory query-string parameters; optional filters are appended below
  # only when the caller supplied them.
  query_params <- list(
    query = query,
    page = paging$page,
    page_size = paging$page_size
  )

  if (!is.null(vocabulary_ids)) {
    query_params[["vocabulary_ids"]] <- join_params(vocabulary_ids)
  }

  if (!is.null(domain_ids)) {
    query_params[["domain_ids"]] <- join_params(domain_ids)
  }

  if (!is.null(standard_concept)) {
    checkmate::assert_choice(standard_concept, c("S", "C"))
    query_params[["standard_concept"]] <- standard_concept
  }

  if (!is.null(concept_class_id)) {
    query_params[["concept_class_id"]] <- concept_class_id
  }

  if (!is.null(threshold)) {
    checkmate::assert_number(threshold, lower = 0, upper = 1)
    query_params[["threshold"]] <- threshold
  }

  perform_get(
    private$.base_req,
    "concepts/semantic-search",
    query = query_params
  )
},
278+
279+
#' @description
#' Run a semantic search and collect every page of results by calling
#' `semantic()` repeatedly through `paginate_all()`.
#'
#' @param query Natural language search query (required).
#' @param vocabulary_ids Filter by vocabulary IDs.
#' @param domain_ids Filter by domain IDs.
#' @param standard_concept Filter by standard concept ('S' or 'C').
#' @param concept_class_id Filter by concept class ID.
#' @param threshold Minimum similarity threshold (0.0-1.0).
#' @param page_size Results per page. Default 100.
#' @param max_pages Maximum pages to fetch. Default Inf.
#' @param progress Show progress bar. Default `TRUE`.
#'
#' @returns A tibble of all matching concepts with similarity scores.
semantic_all = function(query,
                        vocabulary_ids = NULL,
                        domain_ids = NULL,
                        standard_concept = NULL,
                        concept_class_id = NULL,
                        threshold = NULL,
                        page_size = 100,
                        max_pages = Inf,
                        progress = TRUE) {
  # Reduce one response to list(data = <records>, meta = <metadata>),
  # whichever of the supported response layouts it arrived in.
  unwrap_page <- function(resp) {
    if (!is.list(resp)) {
      return(list(data = resp %||% list(), meta = list()))
    }

    fields <- names(resp)

    # Layout 1: data + meta envelope, records optionally under data$results.
    if (all(c("data", "meta") %in% fields)) {
      records <- resp$data
      if (is.list(records) && "results" %in% names(records)) {
        records <- records$results
      }
      return(list(data = records, meta = resp$meta))
    }

    # Layout 2: records directly under `results`, meta possibly absent.
    if ("results" %in% fields) {
      return(list(data = resp$results, meta = resp$meta %||% list()))
    }

    # Anything else: treat the whole response as the record list.
    list(data = resp %||% list(), meta = list())
  }

  fetch_fn <- function(page, size) {
    unwrap_page(
      self$semantic(
        query = query,
        vocabulary_ids = vocabulary_ids,
        domain_ids = domain_ids,
        standard_concept = standard_concept,
        concept_class_id = concept_class_id,
        threshold = threshold,
        page = page,
        page_size = size
      )
    )
  }

  paginate_all(
    fetch_fn,
    page_size = page_size,
    max_pages = max_pages,
    progress = progress
  )
},
329+
330+
#' @description
#' Find concepts similar to a reference concept or query
#' (POST `search/similar`).
#'
#' Must provide exactly one of: concept_id, concept_name, or query.
#'
#' @param concept_id Concept ID to find similar concepts for (single
#'   integer-like value).
#' @param concept_name Concept name to find similar concepts for (single
#'   non-empty string).
#' @param query Natural language query for semantic similarity (single
#'   non-empty string).
#' @param algorithm One of 'semantic', 'lexical', or 'hybrid' (default).
#' @param similarity_threshold Minimum similarity (0.0-1.0). Default 0.7.
#' @param page_size Max results (single value, 1-1000). Default 20; only
#'   sent in the request when it differs from 20.
#' @param vocabulary_ids Filter by vocabulary IDs.
#' @param domain_ids Filter by domain IDs.
#' @param standard_concept Filter by standard concept flag ('S', 'C', or 'N').
#' @param include_invalid Include invalid/deprecated concepts (`TRUE`/`FALSE`).
#' @param include_scores Include detailed similarity scores (`TRUE`/`FALSE`).
#' @param include_explanations Include similarity explanations
#'   (`TRUE`/`FALSE`).
#'
#' @returns List with similar_concepts and search_metadata.
#'
#' @note When algorithm='semantic', only single vocabulary/domain filter supported.
similar = function(concept_id = NULL,
                   concept_name = NULL,
                   query = NULL,
                   algorithm = "hybrid",
                   similarity_threshold = 0.7,
                   page_size = 20,
                   vocabulary_ids = NULL,
                   domain_ids = NULL,
                   standard_concept = NULL,
                   include_invalid = NULL,
                   include_scores = NULL,
                   include_explanations = NULL) {
  # Validate exactly one of concept_id, concept_name, or query provided
  provided <- sum(!is.null(concept_id), !is.null(concept_name), !is.null(query))
  if (provided != 1) {
    cli::cli_abort(
      "Exactly one of {.arg concept_id}, {.arg concept_name}, or {.arg query} must be provided"
    )
  }

  # Validate every argument before any part of the request is assembled, so
  # a bad value never leaves a half-built body behind.
  checkmate::assert_choice(algorithm, c("semantic", "lexical", "hybrid"))
  checkmate::assert_number(similarity_threshold, lower = 0, upper = 1)
  # Must be a single non-missing value: it is used in a scalar `if` below,
  # which fails on vectors and on NA.
  checkmate::assert_integerish(
    page_size,
    lower = 1, upper = 1000, len = 1, any.missing = FALSE
  )
  checkmate::assert_integerish(
    concept_id,
    len = 1, any.missing = FALSE, null.ok = TRUE
  )
  checkmate::assert_string(concept_name, min.chars = 1, null.ok = TRUE)
  checkmate::assert_string(query, min.chars = 1, null.ok = TRUE)
  checkmate::assert_choice(standard_concept, c("S", "C", "N"), null.ok = TRUE)
  checkmate::assert_flag(include_invalid, null.ok = TRUE)
  checkmate::assert_flag(include_scores, null.ok = TRUE)
  checkmate::assert_flag(include_explanations, null.ok = TRUE)

  body <- list(
    algorithm = algorithm,
    similarity_threshold = similarity_threshold
  )

  if (!is.null(concept_id)) {
    body$concept_id <- as.integer(concept_id)
  }
  if (!is.null(concept_name)) {
    body$concept_name <- concept_name
  }
  if (!is.null(query)) {
    body$query <- query
  }
  # The default page size is left out of the request body.
  if (page_size != 20) {
    body$page_size <- as.integer(page_size)
  }
  # as.list() keeps the filters as list elements even for a single value.
  if (!is.null(vocabulary_ids)) {
    body$vocabulary_ids <- as.list(vocabulary_ids)
  }
  if (!is.null(domain_ids)) {
    body$domain_ids <- as.list(domain_ids)
  }
  if (!is.null(standard_concept)) {
    body$standard_concept <- standard_concept
  }
  if (!is.null(include_invalid)) {
    body$include_invalid <- include_invalid
  }
  if (!is.null(include_scores)) {
    body$include_scores <- include_scores
  }
  if (!is.null(include_explanations)) {
    body$include_explanations <- include_explanations
  }

  perform_post(private$.base_req, "search/similar", body = body)
},
417+
225418
#' @description
226419
#' Print resource information.
227420
print = function() {
228421
cat("<OMOPHub SearchResource>\n")
229-
cat(" Methods: basic, basic_all, advanced, autocomplete\n")
422+
cat(" Methods: basic, basic_all, advanced, autocomplete, semantic, semantic_all, similar\n")
230423
invisible(self)
231424
}
232425
),

README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,29 @@ mappings <- client$mappings$get(201826, target_vocabulary = "ICD10CM")
8282
ancestors <- client$hierarchy$ancestors(201826, max_levels = 3)
8383
```
8484

85+
## Semantic Search
86+
87+
Find concepts with natural language queries, matched via neural embeddings:
88+
89+
```r
90+
# Natural language search - understands clinical intent
91+
results <- client$search$semantic("high blood sugar levels")
92+
for (r in results$data$results) {
93+
cat(sprintf("%s (similarity: %.2f)\n", r$concept_name, r$similarity_score))
94+
}
95+
96+
# Filter by vocabulary and set minimum similarity threshold
97+
results <- client$search$semantic(
98+
"heart attack",
99+
vocabulary_ids = "SNOMED",
100+
domain_ids = "Condition",
101+
threshold = 0.5
102+
)
103+
104+
# Fetch all results with auto-pagination
105+
all_results <- client$search$semantic_all("chronic kidney disease", page_size = 50)
106+
```
107+
85108
## Use Cases
86109

87110
### ETL & Data Pipelines
@@ -169,7 +192,7 @@ concepts_df %>%
169192
| Resource | Description | Key Methods |
170193
|----------|-------------|-------------|
171194
| `concepts` | Concept lookup and batch operations | `get()`, `get_by_code()`, `batch()`, `suggest()` |
172-
| `search` | Full-text and semantic search | `basic()`, `advanced()`, `basic_all()` |
195+
| `search` | Full-text and semantic search | `basic()`, `advanced()`, `semantic()`, `semantic_all()`, `basic_all()` |
173196
| `hierarchy` | Navigate concept relationships | `ancestors()`, `descendants()` |
174197
| `mappings` | Cross-vocabulary mappings | `get()`, `map()` |
175198
| `vocabularies` | Vocabulary metadata | `list()`, `get()`, `stats()` |

0 commit comments

Comments
 (0)