Mirix-AI · wangyu-ustc · Apr 12, 2026 · Apr 10, 2026 · Apr 11, 2026
diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,32 @@
+# Evaluation of MIRIX on public benchmarks
+
+
+1. Step 1:
+Install uv with `brew install uv`, then run:
+```
+uv venv
+source .venv/bin/activate
+python -m ensurepip --upgrade
+python -m pip install -r requirements.txt
+```
+
+2. Step 2:
+Start the backend:
+In the `MIRIX` folder, run:
+```
+uv run python scripts/start_server.py
+```
+
+3. Step 3:
+In another terminal tab, run:
+```
+uv run python main_eval.py --limit 1 --run-llm --mirix_config_path ./configs/0201c.yaml --output_path results/0201c
+```
+
+4. Step 4:
+Evaluation. Run:
+```
+uv run organize_results.py results/0201c
+```
+
+After this step, `results/0201c` will contain a `metrics.json` file with all the metrics.
diff --git a/evals/clear_records.py b/evals/clear_records.py
@@ -0,0 +1,187 @@
+#!/usr/bin/env python3
+"""
+Script to remove 'records' key from JSON files in a specified folder.
+If metrics.json exists, only removes records marked as WRONG (label 'WRONG' or score 0).
+If metrics.json doesn't exist, removes all records.
+
+Usage:
+    python clear_records.py <folder_path>
+
+Example:
+    python clear_records.py results/0201a
+"""
+
+import json
+import sys
+from pathlib import Path
+from typing import Optional, Set, Tuple
+
+
+def load_wrong_records(metrics_path: Path) -> Set[Tuple[str, int]]:
+    """
+    Load metrics.json and collect the (sample_id, question_index) pairs judged wrong.
+
+    An entry of 'llm_judge_results' counts as wrong when its 'label' is 'WRONG'
+    or its 'score' equals 0. Entries lacking 'sample_id' or 'question_index'
+    are skipped.
+
+    Args:
+        metrics_path: Path to metrics.json file
+
+    Returns:
+        Set of (sample_id, question_index) tuples for wrong records. An empty
+        set is returned (after printing the error) if the file cannot be read
+        or does not have the expected structure.
+    """
+    try:
+        with open(metrics_path, 'r', encoding='utf-8') as f:
+            metrics = json.load(f)
+
+        wrong_records = set()
+        for result in metrics.get('llm_judge_results', []):
+            if result.get('label') != 'WRONG' and result.get('score') != 0:
+                continue
+            sample_id = result.get('sample_id')
+            question_index = result.get('question_index')
+            # Compare against None rather than truthiness: a question_index of 0
+            # is a valid index and must not be dropped.
+            if sample_id is not None and question_index is not None:
+                wrong_records.add((sample_id, question_index))
+
+        return wrong_records
+    except Exception as e:
+        # Best-effort: report the problem and fall back to an empty selection.
+        print(f"  ✗ Error loading metrics.json: {e}")
+        return set()
+
+
+def clear_records_from_file(file_path: Path, wrong_records: Optional[Set[Tuple[str, int]]] = None) -> bool:
+    """
+    Strip the 'records' entry of one JSON file, fully or selectively.
+
+    With wrong_records=None the whole 'records' key is deleted. Otherwise only
+    the entries whose (sample_id, question_index) pair is in wrong_records are
+    dropped, where sample_id is read from the top level of the file and
+    question_index from each record. The file is rewritten only when
+    something was removed.
+
+    Args:
+        file_path: Path to the JSON file
+        wrong_records: Optional set of (sample_id, question_index) tuples to remove
+
+    Returns:
+        True if the file was modified, False otherwise (including on errors,
+        which are printed rather than raised)
+    """
+    name = file_path.name
+    try:
+        with open(file_path, 'r', encoding='utf-8') as src:
+            data = json.load(src)
+
+        # Nothing to do for files that carry no records at all.
+        if 'records' not in data:
+            print(f"  No 'records' key in {name}")
+            return False
+
+        sample_id = data.get('sample_id')
+
+        if wrong_records is None:
+            # No selection given: drop the whole key.
+            print(f"  Found 'records' key in {name}")
+            del data['records']
+            print(f"  ✓ Removed all records from {name}")
+        else:
+            # Keep every record whose (sample_id, question_index) is not flagged.
+            records = data['records']
+            total = len(records)
+            kept = {
+                key: record
+                for key, record in records.items()
+                if (sample_id, record.get('question_index')) not in wrong_records
+            }
+            removed = total - len(kept)
+            if removed == 0:
+                print(f"  No wrong records found in {name}")
+                return False
+            data['records'] = kept
+            print(f"  ✓ Removed {removed}/{total} wrong records from {name}")
+
+        # Reaching this point means something was removed: persist the result.
+        with open(file_path, 'w', encoding='utf-8') as dst:
+            json.dump(data, dst, indent=2, ensure_ascii=False)
+        return True
+
+    except json.JSONDecodeError as e:
+        print(f"  ✗ Error decoding JSON in {name}: {e}")
+        return False
+    except Exception as e:
+        print(f"  ✗ Error processing {name}: {e}")
+        return False
+
+
+def clear_records_from_folder(folder_path: str) -> None:
+    """
+    Process all JSON files in a folder and remove 'records' key.
+    If metrics.json exists, only removes records marked as WRONG.
+    If metrics.json doesn't exist, removes all records.
+
+    Only '*.json' files directly inside the folder are processed (no
+    recursion), and metrics.json itself is never modified. Exits the process
+    with status 1 if the folder does not exist or is not a directory.
+
+    Args:
+        folder_path: Path to the folder containing JSON files
+    """
+    folder = Path(folder_path)
+
+    if not folder.exists():
+        print(f"Error: Folder '{folder_path}' does not exist")
+        sys.exit(1)
+
+    if not folder.is_dir():
+        print(f"Error: '{folder_path}' is not a directory")
+        sys.exit(1)
+
+    # Check for metrics.json; None means "no metrics, remove everything".
+    metrics_path = folder / 'metrics.json'
+    wrong_records = None
+
+    if metrics_path.exists():
+        print("Found metrics.json - will only remove WRONG records\n")
+        wrong_records = load_wrong_records(metrics_path)
+        if wrong_records:
+            print(f"Identified {len(wrong_records)} wrong record(s) to remove\n")
+        else:
+            print("No wrong records found in metrics.json\n")
+    else:
+        print("No metrics.json found - will remove all records\n")
+
+    # Find all JSON files in the folder (excluding metrics.json)
+    json_files = [f for f in folder.glob('*.json') if f.name != 'metrics.json']
+
+    if not json_files:
+        print(f"No JSON files found in '{folder_path}'")
+        return
+
+    print(f"Processing {len(json_files)} JSON file(s) in '{folder_path}'...\n")
+
+    # Sorted so the per-file output appears in a stable order.
+    modified_count = sum(
+        1 for json_file in sorted(json_files)
+        if clear_records_from_file(json_file, wrong_records)
+    )
+
+    print(f"\n{'='*60}")
+    print("Summary:")
+    print(f"  Total files processed: {len(json_files)}")
+    print(f"  Files modified: {modified_count}")
+    print(f"  Files unchanged: {len(json_files) - modified_count}")
+    if wrong_records is not None:
+        # This is the number of records flagged in metrics.json, which is not
+        # necessarily the number actually removed from the files.
+        print(f"  Wrong records identified in metrics.json: {len(wrong_records)}")
+    print(f"{'='*60}\n")
+
+
+def main():
+    """Command-line entry point: expects exactly one argument, the folder to clean."""
+    args = sys.argv[1:]
+    if len(args) != 1:
+        # Wrong argument count: show usage and exit with a failure status.
+        for line in (
+            "Usage: python clear_records.py <folder_path>",
+            "\nExample:",
+            "  python clear_records.py results/0201a",
+        ):
+            print(line)
+        sys.exit(1)
+
+    clear_records_from_folder(args[0])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evals/configs/0201c.yaml b/evals/configs/0201c.yaml
@@ -0,0 +1,41 @@
+# Mirix Configuration - OpenAI Setup
+# Simple configuration using OpenAI's gpt-4.1-mini (gpt-4.1-nano for topic extraction)
+
+llm_config:
+  model: "gpt-4.1-mini"
+  model_endpoint_type: "openai"
+  api_key: your-api-key
+  model_endpoint: "https://api.openai.com/v1"
+  context_window: 128000
+
+topic_extraction_llm_config:
+  model: "gpt-4.1-nano"
+  model_endpoint_type: "openai"
+  api_key: your-api-key
+  model_endpoint: "https://api.openai.com/v1"
+  context_window: 128000
+
+embedding_config:
+  embedding_model: "text-embedding-3-small"
+  embedding_endpoint: "https://api.openai.com/v1"
+  api_key: your-api-key
+  embedding_endpoint_type: "openai"
+  embedding_dim: 1536
+
+build_embeddings_for_memory: true
+
+meta_agent_config:
+  system_prompts_folder: prompts/0201a/
+  agents:
+    - core_memory_agent
+    - resource_memory_agent
+    - semantic_memory_agent
+    - episodic_memory_agent
+    - procedural_memory_agent
+    - knowledge_vault_memory_agent
+  memory:
+    core:
+      - label: "human"
+        value: ""
+      - label: "persona"
+        value: "I am a helpful assistant."
diff --git a/evals/llm_judge.py b/evals/llm_judge.py
@@ -0,0 +1,134 @@
+import argparse
+import json
+from collections import defaultdict
+
+import numpy as np
+import openai
+from openai import OpenAI
+
+from dotenv import load_dotenv
+import os
+
+# Module-level setup, executed at import time: load_dotenv() runs first
+# (presumably so a local .env file can supply the key - confirm against the
+# python-dotenv docs), then the key is read from the OPENAI_API_KEY
+# environment variable and a shared OpenAI client is created with it.
+load_dotenv()
+api_key = os.getenv("OPENAI_API_KEY")
+client = OpenAI(api_key=api_key)
+
+# Prompt template for the LLM judge. evaluate_llm_judge() fills the
+# {question}, {gold_answer} and {generated_answer} placeholders via
+# str.format() and reads the "label" key of the JSON reply.
+# NOTE(review): the text is sent to the model verbatim, so any edit
+# (including the typographic quotes) may change grading results.
+ACCURACY_PROMPT = """
+Your task is to label an answer to a question as ’CORRECT’ or ’WRONG’. You will be given the following data:
+    (1) a question (posed by one user to another user),
+    (2) a ’gold’ (ground truth) answer,
+    (3) a generated answer
+which you will score as CORRECT/WRONG.
+
+The point of the question is to ask about something one user should know about the other user based on their prior conversations.
+The gold answer will usually be a concise and short answer that includes the referenced topic, for example:
+Question: Do you remember what I got the last time I went to Hawaii?
+Gold answer: A shell necklace
+The generated answer might be much longer, but you should be generous with your grading - as long as it touches on the same topic as the gold answer, it should be counted as CORRECT.
+
+For time related questions, the gold answer will be a specific date, month, year, etc. The generated answer might be much longer or use relative time references (like "last Tuesday" or "next month"), but you should be generous with your grading - as long as it refers to the same date or time period as the gold answer, it should be counted as CORRECT. Even if the format differs (e.g., "May 7th" vs "7 May"), consider it CORRECT if it's the same date.
+
+Now it’s time for the real question:
+Question: {question}
+Gold answer: {gold_answer}
+Generated answer: {generated_answer}
+
+First, provide a short (one sentence) explanation of your reasoning, then finish with CORRECT or WRONG.
+Do NOT include both CORRECT and WRONG in your response, or it will break the evaluation script.
+
+Just return the label CORRECT or WRONG in a json format with the key as "label".
+"""
+
+
+def evaluate_llm_judge(question, gold_answer, generated_answer, model="gpt-4o-mini"):
+    """Evaluate the generated answer against the gold answer using an LLM judge.
+
+    Formats ACCURACY_PROMPT with the three texts, sends it as a single user
+    message through the module-level OpenAI client (JSON response format,
+    temperature 0.0) and reads the "label" key of the JSON reply.
+
+    Args:
+        question: The question that was asked.
+        gold_answer: The ground-truth answer.
+        generated_answer: The answer to be graded.
+        model: Chat model used as the judge (defaults to "gpt-4o-mini").
+
+    Returns:
+        1 if the judge's label is CORRECT, otherwise 0.
+
+    Raises:
+        KeyError: If the judge's JSON reply has no "label" key.
+        json.JSONDecodeError: If the reply is not valid JSON.
+    """
+    response = client.chat.completions.create(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": ACCURACY_PROMPT.format(
+                    question=question, gold_answer=gold_answer, generated_answer=generated_answer
+                ),
+            }
+        ],
+        response_format={"type": "json_object"},
+        temperature=0.0,
+    )
+    verdict = json.loads(response.choices[0].message.content)
+    # Normalize case and surrounding whitespace so that replies such as
+    # "Correct" or " CORRECT " are not silently scored as wrong.
+    label = str(verdict["label"]).strip().upper()
+    return 1 if label == "CORRECT" else 0
+
+
+# def main():
+#     """Main function to evaluate RAG results using LLM judge."""
+#     parser = argparse.ArgumentParser(description="Evaluate RAG results using LLM judge")
+#     parser.add_argument(
+#         "--input_file",
+#         type=str,
+#         default="results/default_run_v4_k30_new_graph.json",
+#         help="Path to the input dataset file",
+#     )
+
+#     args = parser.parse_args()
+
+#     dataset_path = args.input_file
+#     output_path = f"results/llm_judge_{dataset_path.split('/')[-1]}"
+
+#     with open(dataset_path, "r") as f:
+#         data = json.load(f)
+
+#     LLM_JUDGE = defaultdict(list)
+#     RESULTS = defaultdict(list)
+
+#     index = 0
+#     for k, v in data.items():
+#         for x in v:
+#             question = x["question"]
+#             gold_answer = x["answer"]
+#             generated_answer = x["response"]
+#             category = x["category"]
+
+#             # Skip category 5
+#             if int(category) == 5:
+#                 continue
+
+#             # Evaluate the answer
+#             label = evaluate_llm_judge(question, gold_answer, generated_answer)
+#             LLM_JUDGE[category].append(label)
+
+#             # Store the results
+#             RESULTS[index].append(
+#                 {
+#                     "question": question,
+#                     "gt_answer": gold_answer,
+#                     "response": generated_answer,
+#                     "category": category,
+#                     "llm_label": label,
+#                 }
+#             )
+
+#             # Save intermediate results
+#             with open(output_path, "w") as f:
+#                 json.dump(RESULTS, f, indent=4)
+
+#             # Print current accuracy for all categories
+#             print("All categories accuracy:")
+#             for cat, results in LLM_JUDGE.items():
+#                 if results:  # Only print if there are results for this category
+#                     print(f"  Category {cat}: {np.mean(results):.4f} " f"({sum(results)}/{len(results)})")
+#             print("------------------------------------------")
+#         index += 1
+
+#     # Save final results
+#     with open(output_path, "w") as f:
+#         json.dump(RESULTS, f, indent=4)
+
+#     # Print final summary
+#     print("PATH: ", dataset_path)
+#     print("------------------------------------------")
+#     for k, v in LLM_JUDGE.items():
+#         print(k, np.mean(v))
+
+
+# if __name__ == "__main__":
+#     main()