1+ import datetime
12import json
23import os
4+ import subprocess
35
4- from eval .BERTscore_eval import run_bertscore_eval
5- from eval .bleu_eval import run_bleu_eval
6- from eval .LLM_as_Judge import SemanticEvaluator
7- from eval .rouge_eval import run_rouge_eval
6+ from eval_generation .BERTscore_eval import run_bertscore_eval
7+ from eval_generation .bleu_eval import run_bleu_eval
8+ from eval_generation .LLM_as_Judge import SemanticEvaluator
9+ from eval_generation .rouge_eval import run_rouge_eval
810
911
1012def run_semantic_evaluation (save_results_path , test_data_path ):
@@ -19,17 +21,45 @@ def load_config(config_path):
1921 return config
2022
2123
def run_notebook_with_date(input_notebook_path, output_dir):
    """
    Execute a Jupyter notebook and save the executed copy with the current
    date appended to its filename.

    The output file is named ``<basename>_<YYYYMMDD>.ipynb`` and written to
    *output_dir*. Note: the scraped original formatted the name with stray
    spaces inside the f-string literal segments (``"{ base } _{ date } .ipynb"``),
    which would have produced a filename containing spaces — fixed here.

    Parameters
    ----------
    input_notebook_path : str
        Path to the ``.ipynb`` file to execute.
    output_dir : str
        Directory in which the executed notebook is saved.

    Returns
    -------
    str
        Full path of the executed output notebook (backward-compatible
        addition; the original returned ``None``).

    Raises
    ------
    subprocess.CalledProcessError
        If ``jupyter nbconvert --execute`` exits non-zero (``check=True``).
    """
    date_str = datetime.datetime.now().strftime("%Y%m%d")
    base_name = os.path.splitext(os.path.basename(input_notebook_path))[0]
    output_path = os.path.join(output_dir, f"{base_name}_{date_str}.ipynb")
    # List form (shell=False) keeps paths with spaces safe from shell parsing.
    subprocess.run(
        [
            "jupyter",
            "nbconvert",
            "--to",
            "notebook",
            "--execute",
            "--output",
            output_path,
            input_notebook_path,
        ],
        check=True,
    )
    print(f"Executed notebook saved to: {output_path}")
    return output_path
48+
2249def main ():
2350 cwd = os .getcwd ()
2451 # Paths
25- test_data_path = os .path .join (cwd , "eval/data/test_samples_german_faq.csv" )
26- results_dir = os .path .join (cwd , "eval/results" )
52+ test_data_path = os .path .join (
53+ cwd , "eval_generation/data/test_samples_german_faq.csv"
54+ )
55+ date_str = datetime .datetime .now ().strftime ("%Y%m%d" )
56+ results_dir = os .path .join (cwd , f"eval_generation/results/results_{ date_str } " )
2757 os .makedirs (results_dir , exist_ok = True )
2858 llm_judge_results_path = os .path .join (results_dir , "llm_judge_results_de.csv" )
2959
3060 # 1. Run LLM as Judge and save results
3161 print ("Running LLM as Judge (semantic evaluation)..." )
32- # run_semantic_evaluation(llm_judge_results_path, test_data_path)
62+ run_semantic_evaluation (llm_judge_results_path , test_data_path )
3363
3464 # 2. Prepare config for metrics
3565 config = {
@@ -43,7 +73,7 @@ def main():
4373 }
4474
4575 # save paths to json file. The file will be used by notebooks
46- config_path = os .path .join (cwd , "eval /result_paths.json" )
76+ config_path = os .path .join (cwd , "eval_generation /result_paths.json" )
4777 with open (config_path , "w" ) as f :
4878 json .dump (config , f , indent = 4 )
4979
@@ -61,6 +91,14 @@ def main():
6191
6292 print ("All metrics computed and saved." )
6393
94+ # 6. run notebooks for visualization
95+ notebook_path = os .path .join (
96+ cwd , "eval_generation/automatic_eval/lexical_semantic_eval.ipynb"
97+ )
98+ run_notebook_with_date (notebook_path , results_dir )
99+
100+ print ("Evaluation pipeline completed." )
101+
64102
65103if __name__ == "__main__" :
66104 main ()
0 commit comments