Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions evals/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Evaluation of MIRIX on public benchmarks


1. Step 1:
Install uv with `brew install uv`, then run:
```
uv venv
source .venv/bin/activate
python -m ensurepip --upgrade
python -m pip install -r requirements.txt
```

2. Step 2:
Start the backend:
In the `MIRIX` folder, run:
```
uv run python scripts/start_server.py
```

3. Step 3:
In another terminal tab, run:
```
uv run python main_eval.py --limit 1 --run-llm --mirix_config_path ./configs/0201c.yaml --output_path results/0201c
```

4. Step 4:
Evaluation. Run:
```
uv run organize_results.py results/0201c
```

This writes `metrics.json` to `results/0201c`, which contains all the metrics.
187 changes: 187 additions & 0 deletions evals/clear_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env python3
"""
Script to remove 'records' key from JSON files in a specified folder.
If metrics.json exists, only removes records marked as WRONG.
If metrics.json doesn't exist, removes all records.

Usage:
python clear_records.py <folder_path>

Example:
python clear_records.py results/0201a
"""

import json
import sys
from pathlib import Path
from typing import Optional, Set, Tuple


def load_wrong_records(metrics_path: Path) -> Set[Tuple[str, int]]:
    """
    Load metrics.json and collect the records that the judge marked as wrong.

    A result counts as wrong when its 'label' is 'WRONG' or its 'score'
    equals 0. Results missing either 'sample_id' or 'question_index' are
    skipped because they cannot be matched against a record.

    Args:
        metrics_path: Path to metrics.json file

    Returns:
        Set of (sample_id, question_index) tuples for wrong records. An empty
        set is returned (after printing the error) if the file cannot be read
        or parsed.
    """
    try:
        with open(metrics_path, 'r', encoding='utf-8') as f:
            metrics = json.load(f)

        wrong_records = set()
        for result in metrics.get('llm_judge_results', []):
            if result.get('label') != 'WRONG' and result.get('score') != 0:
                continue

            sample_id = result.get('sample_id')
            question_index = result.get('question_index')
            # Test for None explicitly: a truthiness check would discard the
            # perfectly valid question_index 0.
            if sample_id is not None and question_index is not None:
                wrong_records.add((sample_id, question_index))

        return wrong_records
    except Exception as e:
        # Best effort: report the problem and fall back to "nothing to remove".
        print(f" ✗ Error loading metrics.json: {e}")
        return set()


def clear_records_from_file(file_path: Path, wrong_records: Optional[Set[Tuple[str, int]]] = None) -> bool:
    """
    Strip records from a single JSON result file, rewriting it in place.

    With wrong_records=None the whole 'records' key is deleted. Otherwise only
    the entries whose (sample_id, question_index) pair appears in
    wrong_records are dropped, and the file is left untouched when none match.

    Args:
        file_path: Path to the JSON file
        wrong_records: Optional set of (sample_id, question_index) tuples to remove

    Returns:
        True if the file was modified, False otherwise (including on errors,
        which are printed rather than raised)
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as src:
            payload = json.load(src)

        # Nothing to do for files that never stored records.
        if 'records' not in payload:
            print(f" No 'records' key in {file_path.name}")
            return False

        sample_id = payload.get('sample_id')

        if wrong_records is None:
            # No metrics available: drop every record.
            print(f" Found 'records' key in {file_path.name}")
            del payload['records']
            print(f" ✓ Removed all records from {file_path.name}")
        else:
            # Keep only the records that were not judged wrong.
            total = len(payload['records'])
            kept = {
                key: record
                for key, record in payload['records'].items()
                if (sample_id, record.get('question_index')) not in wrong_records
            }
            dropped = total - len(kept)

            if dropped == 0:
                print(f" No wrong records found in {file_path.name}")
                return False

            payload['records'] = kept
            print(f" ✓ Removed {dropped}/{total} wrong records from {file_path.name}")

        # Reaching this point means the payload changed; persist it.
        with open(file_path, 'w', encoding='utf-8') as dst:
            json.dump(payload, dst, indent=2, ensure_ascii=False)
        return True

    except json.JSONDecodeError as e:
        print(f" ✗ Error decoding JSON in {file_path.name}: {e}")
        return False
    except Exception as e:
        print(f" ✗ Error processing {file_path.name}: {e}")
        return False


def clear_records_from_folder(folder_path: str) -> None:
    """
    Process all JSON files in a folder and remove their 'records' entries.

    If metrics.json exists in the folder, only records marked as WRONG there
    are removed. If it does not exist, the whole 'records' key is removed from
    every file. metrics.json itself is never modified.

    Exits the process with status 1 when folder_path does not exist or is not
    a directory.

    Args:
        folder_path: Path to the folder containing JSON files
    """
    folder = Path(folder_path)

    if not folder.exists():
        print(f"Error: Folder '{folder_path}' does not exist")
        sys.exit(1)

    if not folder.is_dir():
        print(f"Error: '{folder_path}' is not a directory")
        sys.exit(1)

    # metrics.json decides between selective and full removal.
    metrics_path = folder / 'metrics.json'
    wrong_records = None

    if metrics_path.exists():
        print("Found metrics.json - will only remove WRONG records\n")
        wrong_records = load_wrong_records(metrics_path)
        if wrong_records:
            print(f"Identified {len(wrong_records)} wrong record(s) to remove\n")
        else:
            print("No wrong records found in metrics.json\n")
    else:
        print("No metrics.json found - will remove all records\n")

    # Every JSON file except metrics.json is a candidate; sort for stable output.
    json_files = sorted(f for f in folder.glob('*.json') if f.name != 'metrics.json')

    if not json_files:
        print(f"No JSON files found in '{folder_path}'")
        return

    print(f"Processing {len(json_files)} JSON file(s) in '{folder_path}'...\n")

    modified_count = 0
    for json_file in json_files:
        if clear_records_from_file(json_file, wrong_records):
            modified_count += 1

    print(f"\n{'='*60}")
    print("Summary:")
    print(f" Total files processed: {len(json_files)}")
    print(f" Files modified: {modified_count}")
    print(f" Files unchanged: {len(json_files) - modified_count}")
    if wrong_records is not None:
        # This is the number of wrong records listed in metrics.json, not the
        # number actually deleted (some may not be present in any file), so it
        # is labelled as "identified" rather than "removed".
        print(f" Wrong records identified: {len(wrong_records)}")
    print(f"{'='*60}\n")


def main():
    """Command-line entry point: expects exactly one argument, the folder path."""
    cli_args = sys.argv[1:]

    if len(sys.argv) == 2:
        clear_records_from_folder(cli_args[0])
        return

    # Wrong number of arguments: show usage and signal failure.
    print("Usage: python clear_records.py <folder_path>")
    print("\nExample:")
    print(" python clear_records.py results/0201a")
    sys.exit(1)


if __name__ == "__main__":
    main()
41 changes: 41 additions & 0 deletions evals/configs/0201c.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Mirix Configuration - OpenAI Setup
# Simple configuration using OpenAI's gpt-4.1-mini (main LLM) and gpt-4.1-nano (topic extraction)

llm_config:
model: "gpt-4.1-mini"
model_endpoint_type: "openai"
api_key: your-api-key
model_endpoint: "https://api.openai.com/v1"
context_window: 128000

topic_extraction_llm_config:
model: "gpt-4.1-nano"
model_endpoint_type: "openai"
api_key: your-api-key
model_endpoint: "https://api.openai.com/v1"
context_window: 128000

embedding_config:
embedding_model: "text-embedding-3-small"
embedding_endpoint: "https://api.openai.com/v1"
api_key: your-api-key
embedding_endpoint_type: "openai"
embedding_dim: 1536

build_embeddings_for_memory: true

meta_agent_config:
system_prompts_folder: prompts/0201a/
agents:
- core_memory_agent
- resource_memory_agent
- semantic_memory_agent
- episodic_memory_agent
- procedural_memory_agent
- knowledge_vault_memory_agent
memory:
core:
- label: "human"
value: ""
- label: "persona"
value: "I am a helpful assistant."
134 changes: 134 additions & 0 deletions evals/llm_judge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import argparse
import json
from collections import defaultdict

import numpy as np
import openai
from openai import OpenAI

from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before the key is read below.
load_dotenv()
# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level OpenAI client; evaluate_llm_judge uses it for every judge call.
client = OpenAI(api_key=api_key)

# Prompt template for the LLM judge. It is filled in with str.format and
# expects exactly three fields: {question}, {gold_answer}, {generated_answer}.
# NOTE(review): the closing instructions ask both for a one-sentence
# explanation and for "just" a JSON object with the key "label"; the caller
# only reads the "label" key of the JSON response.
ACCURACY_PROMPT = """
Your task is to label an answer to a question as ’CORRECT’ or ’WRONG’. You will be given the following data:
(1) a question (posed by one user to another user),
(2) a ’gold’ (ground truth) answer,
(3) a generated answer
which you will score as CORRECT/WRONG.

The point of the question is to ask about something one user should know about the other user based on their prior conversations.
The gold answer will usually be a concise and short answer that includes the referenced topic, for example:
Question: Do you remember what I got the last time I went to Hawaii?
Gold answer: A shell necklace
The generated answer might be much longer, but you should be generous with your grading - as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.

For time related questions, the gold answer will be a specific date, month, year, etc. The generated answer might be much longer or use relative time references (like "last Tuesday" or "next month"), but you should be generous with your grading - as long as it refers to the same date or time period as the gold answer, it should be counted as CORRECT. Even if the format differs (e.g., "May 7th" vs "7 May"), consider it CORRECT if it's the same date.

Now it’s time for the real question:
Question: {question}
Gold answer: {gold_answer}
Generated answer: {generated_answer}

First, provide a short (one sentence) explanation of your reasoning, then finish with CORRECT or WRONG.
Do NOT include both CORRECT and WRONG in your response, or it will break the evaluation script.

Just return the label CORRECT or WRONG in a json format with the key as "label".
"""


def evaluate_llm_judge(question, gold_answer, generated_answer, model="gpt-4o-mini"):
    """Evaluate the generated answer against the gold answer using an LLM judge.

    Fills ACCURACY_PROMPT with the three texts, sends it as a single user
    message through the module-level OpenAI client (JSON response format,
    temperature 0), and reads the "label" key of the returned JSON object.

    Args:
        question: The question that was asked.
        gold_answer: The ground-truth answer.
        generated_answer: The answer produced by the system under evaluation.
        model: Name of the chat model used as judge. Defaults to
            "gpt-4o-mini", the value that was previously hard-coded.

    Returns:
        1 if the judge's label is CORRECT, otherwise 0.

    Raises:
        KeyError: If the JSON response has no "label" key.
        json.JSONDecodeError: If the response content is not valid JSON.
    """
    prompt = ACCURACY_PROMPT.format(
        question=question, gold_answer=gold_answer, generated_answer=generated_answer
    )
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.0,
    )
    label = json.loads(response.choices[0].message.content)["label"]
    # Normalise before comparing so that "correct" or " CORRECT " from the
    # model is not silently scored as wrong.
    return 1 if str(label).strip().upper() == "CORRECT" else 0


# def main():
# """Main function to evaluate RAG results using LLM judge."""
# parser = argparse.ArgumentParser(description="Evaluate RAG results using LLM judge")
# parser.add_argument(
# "--input_file",
# type=str,
# default="results/default_run_v4_k30_new_graph.json",
# help="Path to the input dataset file",
# )

# args = parser.parse_args()

# dataset_path = args.input_file
# output_path = f"results/llm_judge_{dataset_path.split('/')[-1]}"

# with open(dataset_path, "r") as f:
# data = json.load(f)

# LLM_JUDGE = defaultdict(list)
# RESULTS = defaultdict(list)

# index = 0
# for k, v in data.items():
# for x in v:
# question = x["question"]
# gold_answer = x["answer"]
# generated_answer = x["response"]
# category = x["category"]

# # Skip category 5
# if int(category) == 5:
# continue

# # Evaluate the answer
# label = evaluate_llm_judge(question, gold_answer, generated_answer)
# LLM_JUDGE[category].append(label)

# # Store the results
# RESULTS[index].append(
# {
# "question": question,
# "gt_answer": gold_answer,
# "response": generated_answer,
# "category": category,
# "llm_label": label,
# }
# )

# # Save intermediate results
# with open(output_path, "w") as f:
# json.dump(RESULTS, f, indent=4)

# # Print current accuracy for all categories
# print("All categories accuracy:")
# for cat, results in LLM_JUDGE.items():
# if results: # Only print if there are results for this category
# print(f" Category {cat}: {np.mean(results):.4f} " f"({sum(results)}/{len(results)})")
# print("------------------------------------------")
# index += 1

# # Save final results
# with open(output_path, "w") as f:
# json.dump(RESULTS, f, indent=4)

# # Print final summary
# print("PATH: ", dataset_path)
# print("------------------------------------------")
# for k, v in LLM_JUDGE.items():
# print(k, np.mean(v))


# if __name__ == "__main__":
# main()
Loading
Loading