Direct llm call#131
Conversation
There was a problem hiding this comment.
Code Review
This pull request introduces scripts and prompts for performing neuroscience-wide named-entity recognition (NER) using OpenAI and OpenRouter models, including support for local Grobid PDF parsing. The feedback highlights several critical improvements: resolving a configuration conflict in pyproject.toml where static dependencies are defined despite being marked as dynamic, safely handling potential null values in the LLM's JSON output to prevent TypeError crashes, and ensuring uploaded files are deleted from OpenAI's servers using a try...finally block to avoid exceeding storage quotas.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| dependencies = [ | ||
| "requests>=2.32.5", | ||
| ] |
There was a problem hiding this comment.
Since dependencies is declared as a dynamic field under [project] (i.e., dynamic = ["version", "dependencies"]), any static dependencies defined in [project.dependencies] will be ignored or cause build/locking issues with Poetry. Instead, the requests dependency should be added to [tool.poetry.dependencies].
| n_entities = len(payload.get("entities", [])) if isinstance(payload, dict) else 0 | ||
| n_terms = len(payload.get("key_terms", [])) if isinstance(payload, dict) else 0 |
There was a problem hiding this comment.
If the LLM returns null for "entities" or "key_terms" (which is common when no entities are found or if the generation is incomplete), payload.get("entities", []) will return None. Calling len(None) will then raise a TypeError and crash the script. It is safer to check the type of the returned value before calling len().
| n_entities = len(payload.get("entities", [])) if isinstance(payload, dict) else 0 | |
| n_terms = len(payload.get("key_terms", [])) if isinstance(payload, dict) else 0 | |
| entities_list = payload.get("entities") if isinstance(payload, dict) else None | |
| n_entities = len(entities_list) if isinstance(entities_list, list) else 0 | |
| terms_list = payload.get("key_terms") if isinstance(payload, dict) else None | |
| n_terms = len(terms_list) if isinstance(terms_list, list) else 0 |
| n_entities = len(payload.get("entities", [])) if isinstance(payload, dict) else 0 | ||
| n_terms = len(payload.get("key_terms", [])) if isinstance(payload, dict) else 0 |
There was a problem hiding this comment.
If the LLM returns null for "entities" or "key_terms", payload.get("entities", []) will return None. Calling len(None) will then raise a TypeError and crash the script. It is safer to check the type of the returned value before calling len().
| n_entities = len(payload.get("entities", [])) if isinstance(payload, dict) else 0 | |
| n_terms = len(payload.get("key_terms", [])) if isinstance(payload, dict) else 0 | |
| entities_list = payload.get("entities") if isinstance(payload, dict) else None | |
| n_entities = len(entities_list) if isinstance(entities_list, list) else 0 | |
| terms_list = payload.get("key_terms") if isinstance(payload, dict) else None | |
| n_terms = len(terms_list) if isinstance(terms_list, list) else 0 |
| vprint(f"Uploading file: {args.file} ...") | ||
| with open(args.file, "rb") as fh: | ||
| uploaded = client.files.create(file=fh, purpose="user_data") | ||
| vprint(f" uploaded (file_id={uploaded.id})") | ||
|
|
||
| vprint(f"Sending request to model '{args.model}' (streaming; this may take a while)...") | ||
| request_input = [ | ||
| { | ||
| "role": "user", | ||
| "content": [ | ||
| {"type": "input_file", "file_id": uploaded.id}, | ||
| { | ||
| "type": "input_text", | ||
| "text": USER_PROMPT.format(metadata_json=json.dumps(metadata)), | ||
| }, | ||
| ], | ||
| } | ||
| ] | ||
|
|
||
| # Only send sampling controls when explicitly requested; some reasoning | ||
| # models reject temperature/seed outright. | ||
| request_kwargs = {} | ||
| if args.temperature is not None: | ||
| request_kwargs["temperature"] = args.temperature | ||
| if args.seed is not None: | ||
| request_kwargs["seed"] = args.seed | ||
| if request_kwargs: | ||
| vprint(f" sampling controls: {request_kwargs}") | ||
|
|
||
| chunks = [] | ||
| chars = 0 | ||
| next_report = 2000 # print a progress line every ~2000 chars | ||
| with client.responses.stream( | ||
| model=args.model, | ||
| instructions=system_prompt, | ||
| input=request_input, | ||
| **request_kwargs, | ||
| ) as stream: | ||
| for event in stream: | ||
| if event.type == "response.output_text.delta": | ||
| chunks.append(event.delta) | ||
| chars += len(event.delta) | ||
| if chars >= next_report: | ||
| vprint(f" ...streaming, {chars} chars received so far") | ||
| next_report += 2000 | ||
| elif event.type == "error": | ||
| print(f" stream error: {event.error}") | ||
| final_response = stream.get_final_response() # surfaces any terminal API error |
There was a problem hiding this comment.
The uploaded file is persisted indefinitely on OpenAI's servers, which can eventually exhaust the user's file storage quota or accumulate unnecessary files. It is highly recommended to wrap the file upload and streaming process in a try...finally block to ensure the uploaded file is deleted after the request completes.
uploaded = None
try:
vprint(f"Uploading file: {args.file} ...")
with open(args.file, "rb") as fh:
uploaded = client.files.create(file=fh, purpose="user_data")
vprint(f" uploaded (file_id={uploaded.id})")
vprint(f"Sending request to model '{args.model}' (streaming; this may take a while)...")
request_input = [
{
"role": "user",
"content": [
{"type": "input_file", "file_id": uploaded.id},
{
"type": "input_text",
"text": USER_PROMPT.format(metadata_json=json.dumps(metadata)),
},
],
}
]
# Only send sampling controls when explicitly requested; some reasoning
# models reject temperature/seed outright.
request_kwargs = {}
if args.temperature is not None:
request_kwargs["temperature"] = args.temperature
if args.seed is not None:
request_kwargs["seed"] = args.seed
if request_kwargs:
vprint(f" sampling controls: {request_kwargs}")
chunks = []
chars = 0
next_report = 2000 # print a progress line every ~2000 chars
with client.responses.stream(
model=args.model,
instructions=system_prompt,
input=request_input,
**request_kwargs,
) as stream:
for event in stream:
if event.type == "response.output_text.delta":
chunks.append(event.delta)
chars += len(event.delta)
if chars >= next_report:
vprint(f" ...streaming, {chars} chars received so far")
next_report += 2000
elif event.type == "error":
print(f" stream error: {event.error}")
final_response = stream.get_final_response() # surfaces any terminal API error
finally:
if uploaded is not None:
vprint(f"Deleting uploaded file {uploaded.id} ...")
try:
client.files.delete(uploaded.id)
except Exception as exc:
print(f"WARNING: Failed to delete uploaded file {uploaded.id}: {exc}")| vprint(f" stamped source_metadata: {metadata}") | ||
|
|
||
| # Compute extraction statistics and stamp them into the metadata. | ||
| entities = payload.get("entities", []) or [] |
There was a problem hiding this comment.
Similarly, if "entities" is None or not a list, payload.get("entities", []) or [] can still result in None or an invalid type. Explicitly validating that entities is a list prevents potential runtime errors during iteration.
| entities = payload.get("entities", []) or [] | |
| entities = payload.get("entities") | |
| if not isinstance(entities, list): | |
| entities = [] |
|
|
||
| # Compute extraction statistics and stamp them into the metadata. | ||
| entities = payload.get("entities", []) or [] | ||
| label_counts = {} |
There was a problem hiding this comment.
Similarly, if "entities" is None or not a list, payload.get("entities", []) or [] can still result in None or an invalid type. Explicitly validating that entities is a list prevents potential runtime errors during iteration.
entities = payload.get("entities")
if not isinstance(entities, list):
entities = []
No description provided.