Commit 026b2aa

Merge pull request #2 from OpenTechBio/AgentBenchmarkingWithMemory
Adding Agent Benchmarking
2 parents 1c85dd2 + 7be1fe5 commit 026b2aa

19 files changed

Lines changed: 3960 additions & 0 deletions

benchmarking/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
.env
__pycache__/
.DS_store
outputs/
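
Note: the ignored .env file is where Evaluator.py (below) loads its configuration from. A minimal sketch, assuming the OpenAI key is the only entry required by this script (the key name comes from the code; the value is a placeholder):

OPENAI_API_KEY=your-key-here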

benchmarking/Evaluator.py

Lines changed: 307 additions & 0 deletions
@@ -0,0 +1,307 @@
import json
import os
import sys
import argparse
from pathlib import Path
from datetime import datetime
import re

# --- Dependency Imports ---
try:
    from dotenv import load_dotenv
except ImportError:
    print("Error: python-dotenv library not found. Please install it: pip install python-dotenv", file=sys.stderr)
    sys.exit(1)

try:
    from openai import OpenAI, APIError
except ImportError:
    print("Error: openai library not found. Please install it: pip install openai", file=sys.stderr)
    sys.exit(1)

# Optional: Use rich for better formatting
try:
    from rich.console import Console
    from rich.prompt import Prompt, Confirm
    from rich.panel import Panel
    HAS_RICH = True
    console = Console()
except ImportError:
    HAS_RICH = False
    console = None
    # Simple print/input fallback if rich is not installed
    class Console:
        def print(self, *args, **kwargs): print(*args)
    class Prompt:
        @staticmethod
        def ask(prompt, default=None):
            p_text = f"{prompt} "
            if default: p_text += f"[{default}] "
            return input(p_text).strip()
    class Confirm:
        @staticmethod
        def ask(prompt, default=False):
            val = input(f"{prompt} [y/N] " if not default else f"{prompt} [Y/n] ").lower().strip()
            if not val: return default
            return val == 'y'
    class Panel:
        def __init__(self, content, title="", border_style=""): self.content=str(content); self.title=title
        def __rich_console__(self, console, options): yield self.title; yield self.content

# --- Constants ---
SCRIPT_DIR = Path(__file__).parent.resolve()
DEFAULT_INPUT_DIR = SCRIPT_DIR / "outputs"
DEFAULT_OUTPUT_DIR = SCRIPT_DIR / "outputs" # Default to save back into input dir
ENV_FILE = SCRIPT_DIR / ".env"

# --- Configuration Loading ---
load_dotenv(dotenv_path=ENV_FILE)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_MODEL = "gpt-4o" # Or your preferred model for evaluation

if not OPENAI_API_KEY:
    if console: console.print(f"[bold red]Error:[/bold red] OPENAI_API_KEY not found in {ENV_FILE}.")
    else: print(f"Error: OPENAI_API_KEY not found in {ENV_FILE}.")
    sys.exit(1)

try:
    openai_client = OpenAI(api_key=OPENAI_API_KEY)
    if console: console.print(f"OpenAI client initialized for model [cyan]{OPENAI_MODEL}[/cyan].")
    else: print(f"OpenAI client initialized for model {OPENAI_MODEL}.")
except Exception as e:
    if console: console.print(f"[bold red]Error initializing OpenAI client:[/bold red] {e}")
    else: print(f"Error initializing OpenAI client: {e}")
    sys.exit(1)

# --- Helper Functions ---

def format_conversation_for_eval(test_data):
    """ Formats the conversation turns into a readable string for the evaluator prompt. """
    if not test_data or "turns" not in test_data:
        return "[No conversation turns found]"

    formatted_lines = []
    for turn in test_data.get("turns", []):
        role = turn.get("role", "unknown").upper()
        content = turn.get("content", "[No content]")

        # Shorten system prompt for brevity in evaluation context if desired
        if role == "SYSTEM":
            # Extract key parts or just indicate system prompt presence
            content = "[System Prompt Provided - see original log for details]"
            # Or keep it: content = turn.get("content", "[No content]")

        # Format code execution results more clearly if they are part of user turn
        if role == "USER" and content.startswith("Code execution result:"):
            # Reformat slightly for clarity
            content = content.replace("Code execution result:", "**CODE EXECUTION RESULT:**")
            content = content.replace("--- STDOUT ---", "**STDOUT:**")
            content = content.replace("--- STDERR ---", "**STDERR:**")
            content = content.replace("--------------", "---") # Shorten separator

        formatted_lines.append(f"--- {role} ---")
        formatted_lines.append(content)
        formatted_lines.append("\n") # Add space between turns

    return "\n".join(formatted_lines)

def call_openai_evaluator(conversation_text, context):
    """ Sends the formatted conversation to OpenAI for evaluation. """
    evaluator_prompt = f"""You are an expert evaluator assessing the performance of an AI assistant acting as a bioinformatician.
The assistant was given a task related to analyzing a single-cell transcriptomics dataset.
The expected performance level is that of an **entry-level post-graduate bioinformatician**.

**Dataset Context:**
- Dataset File: {context.get('dataset_file', 'N/A')}
- Key Metadata: {json.dumps(context.get('dataset_metadata', {}), indent=1, default=str)}

**Task Context:**
- Initial User Prompt: See the first USER prompt below.
- Max Code Attempts Allowed: {context.get('max_code_tries', 'N/A')}

**Conversation Log:**
{conversation_text}

**Evaluation Task:**
Based on the conversation log, evaluate the AI assistant's performance. Consider the following:
1. **Correctness:** Was the generated code correct and did it achieve the intended analysis steps?
2. **Efficiency:** Was the approach reasonable? Were there unnecessary steps?
3. **Interpretation:** Did the assistant correctly interpret the results of its code execution?
4. **Planning:** Did the assistant use its allowed code execution attempts effectively towards the goal?
5. **Clarity:** Was the assistant's text explanation clear and accurate?
6. **Overall Skill:** Does the performance align with an entry-level post-graduate bioinformatician?

**Output Format:**
Please provide your evaluation strictly in the following JSON format ONLY. Do not include any other text before or after the JSON block:
{{
"grade": <integer between 0 and 100>,
"comments": "<string containing your detailed evaluation justifying the grade>"
}}
"""

    if console: console.print(f"Sending evaluation request for context: {context.get('prompt_id', 'unknown')[:20]}...")
    else: print(f"Sending evaluation request for context: {context.get('prompt_id', 'unknown')[:20]}...")

    try:
        response = openai_client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                # Maybe a short system message for the evaluator role itself?
                # {"role": "system", "content": "You are an expert evaluator."},
                {"role": "user", "content": evaluator_prompt}
            ],
            temperature=0.3, # Lower temperature for more deterministic evaluation
            response_format={"type": "json_object"}, # Request JSON output
            max_tokens=1000 # Adjust as needed for comments length
        )
        eval_content = response.choices[0].message.content
        if console: console.print("[green]Evaluation received from OpenAI.[/green]")
        else: print("Evaluation received from OpenAI.")

        # Attempt to parse the JSON response
        try:
            eval_json = json.loads(eval_content)
            # Validate expected keys
            if "grade" in eval_json and "comments" in eval_json:
                # Basic type check (can be more robust)
                if isinstance(eval_json["grade"], int) and isinstance(eval_json["comments"], str):
                    return eval_json
                else:
                    raise ValueError("Incorrect data types for 'grade' or 'comments'.")
            else:
                raise ValueError("Missing 'grade' or 'comments' key in JSON response.")
        except (json.JSONDecodeError, ValueError) as e:
            if console: console.print(f"[bold red]Error parsing evaluation JSON from OpenAI: {e}[/bold red]")
            else: print(f"Error parsing evaluation JSON from OpenAI: {e}")
            if console: console.print(f"Raw response content:\n{eval_content}")
            else: print(f"Raw response content:\n{eval_content}")
            # Return a structured error
            return {"grade": -1, "comments": f"Error parsing OpenAI response: {e}\nRaw Content: {eval_content}"}

    except APIError as e:
        if console: console.print(f"[bold red]OpenAI API Error during evaluation: {e}[/bold red]")
        else: print(f"OpenAI API Error during evaluation: {e}")
        return {"grade": -1, "comments": f"OpenAI API Error: {e}"}
    except Exception as e:
        if console: console.print(f"[bold red]Unexpected error during evaluation call: {e}[/bold red]")
        else: print(f"Unexpected error during evaluation call: {e}")
        import traceback
        traceback.print_exc()
        return {"grade": -1, "comments": f"Unexpected Error: {e}"}

def process_folder(input_dir_path, output_path):
    """Finds JSON files, gets evaluations, and saves them."""
    evaluations = {}
    json_files = list(input_dir_path.glob("*.json"))

    if not json_files:
        if console: console.print(f"[yellow]No JSON files found in '{input_dir_path}'.[/yellow]")
        else: print(f"No JSON files found in '{input_dir_path}'.")
        return

    if console: console.print(f"Found {len(json_files)} JSON file(s) to evaluate.")
    else: print(f"Found {len(json_files)} JSON file(s) to evaluate.")

    for json_file in json_files:
        if console: console.print(f"\n--- Processing: [cyan]{json_file.name}[/cyan] ---")
        else: print(f"\n--- Processing: {json_file.name} ---")
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                results_data = json.load(f)

            # Process each test run within the file (assuming structure {test_id: test_data})
            file_evaluations = {}
            for test_id, test_data in results_data.items():
                if not isinstance(test_data, dict) or "context" not in test_data or "turns" not in test_data:
                    if console: console.print(f"[yellow]Skipping invalid/incomplete data for test ID '{test_id}' in {json_file.name}.[/yellow]")
                    else: print(f"Skipping invalid/incomplete data for test ID '{test_id}' in {json_file.name}.")
                    continue

                conversation_text = format_conversation_for_eval(test_data)
                context = test_data.get("context", {})
                evaluation = call_openai_evaluator(conversation_text, context)
                file_evaluations[test_id] = evaluation # Store evaluation keyed by test_id

            # Store evaluations for this file, keyed by the original filename stem
            evaluations[json_file.stem] = file_evaluations

        except json.JSONDecodeError:
            if console: console.print(f"[red]Error decoding JSON from {json_file.name}. Skipping.[/red]")
            else: print(f"Error decoding JSON from {json_file.name}. Skipping.")
        except Exception as e:
            if console: console.print(f"[red]Error processing file {json_file.name}: {e}[/red]")
            else: print(f"Error processing file {json_file.name}: {e}")

    # --- Save Evaluations ---
    if not evaluations:
        if console: console.print("[yellow]No evaluations were generated.[/yellow]")
        else: print("No evaluations were generated.")
        return

    output_path = Path(output_path) # Ensure it's a Path object

    # Check if output is a directory or file
    if output_path.suffix == ".json":
        # Save all evaluations to a single specified file
        output_filename = output_path
        if console: console.print(f"\nSaving all evaluations to single file: [cyan]{output_filename}[/cyan]")
        else: print(f"\nSaving all evaluations to single file: {output_filename}")
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True) # Ensure parent dir exists
            with open(output_filename, "w", encoding="utf-8") as f:
                json.dump(evaluations, f, indent=2)
            if console: console.print("[green]Evaluations saved successfully.[/green]")
            else: print("Evaluations saved successfully.")
        except Exception as e:
            if console: console.print(f"[bold red]Error saving aggregated evaluations to {output_filename}:[/bold red] {e}")
            else: print(f"Error saving aggregated evaluations to {output_filename}: {e}")
    else:
        # Save evaluations to individual files in the specified directory
        output_dir = output_path
        output_dir.mkdir(parents=True, exist_ok=True) # Ensure dir exists
        if console: console.print(f"\nSaving evaluations to directory: [cyan]{output_dir}[/cyan]")
        else: print(f"\nSaving evaluations to directory: {output_dir}")
        for input_stem, file_evals in evaluations.items():
            output_filename = output_dir / f"{input_stem}_eval.json"
            try:
                with open(output_filename, "w", encoding="utf-8") as f:
                    json.dump(file_evals, f, indent=2)
                if console: console.print(f" Saved: [green]{output_filename.name}[/green]")
                else: print(f" Saved: {output_filename.name}")
            except Exception as e:
                if console: console.print(f" [red]Error saving evaluation for {input_stem}: {e}[/red]")
                else: print(f" Error saving evaluation for {input_stem}: {e}")

def interactive_loop():
    """Handles the interactive user prompts."""
    if console: console.print("\n--- Agent Benchmark Evaluator ---")
    else: print("\n--- Agent Benchmark Evaluator ---")

    # Get input directory
    default_input = str(DEFAULT_INPUT_DIR.resolve())
    while True:
        if console: input_dir_str = Prompt.ask("Enter path to input folder containing results JSONs", default=default_input)
        else: input_dir_str = input(f"Enter path to input folder containing results JSONs [{default_input}]: ").strip() or default_input

        input_dir_path = Path(input_dir_str).resolve()
        if input_dir_path.is_dir():
            break
        else:
            if console: console.print(f"[red]Error: Input path '{input_dir_path}' is not a valid directory.[/red]")
            else: print(f"Error: Input path '{input_dir_path}' is not a valid directory.")

    # Get output path (directory or specific file)
    default_output = str(input_dir_path) # Default output to input dir
    if console: output_path_str = Prompt.ask("Enter output directory or specific .json filename for results", default=default_output)
    else: output_path_str = input(f"Enter output directory or specific .json filename for results [{default_output}]: ").strip() or default_output

    process_folder(input_dir_path, output_path_str)


# --- Main Execution ---
if __name__ == "__main__":
    interactive_loop()
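
For reference, process_folder expects each results JSON in the input folder to map test IDs to objects with "context" and "turns" keys; the context keys below are the ones the evaluator prompt reads (dataset_file, dataset_metadata, max_code_tries, prompt_id). This is a minimal sketch of that shape with illustrative placeholder values, not output from an actual run:

{
  "test_001": {
    "context": {
      "prompt_id": "example-prompt-id",
      "dataset_file": "example_dataset.h5ad",
      "dataset_metadata": {},
      "max_code_tries": 3
    },
    "turns": [
      {"role": "system", "content": "..."},
      {"role": "user", "content": "..."},
      {"role": "assistant", "content": "..."}
    ]
  }
}

Running python Evaluator.py starts the interactive loop, which prompts for the input folder (defaulting to benchmarking/outputs) and an output directory or .json filename; when a directory is given, each input file's evaluations are written as <input stem>_eval.json.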
