Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25,860 changes: 25,817 additions & 43 deletions notebooks/edu_content_human_as_a_judge.ipynb
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few comments on the notebook:

  • You define jsonl_path = "annotations__educational_content__en__gt.jsonl". This fails for me because the file is missing. Should this be gt_annotations_path instead?
  • Why do we plot the standard deviations as a histogram, but the spread as a cumulative distribution?
  • In the section about spreads, why do we print Document ID 1 and 2? Shouldn't they be the same? Or is this just a sanity check?
  • For the evaluation of our predictions, we aggregate the human annotations using majority voting. In the notebook, we're only looking at the mean of the human annotations. Should we add info about the majority voting as well? E.g. we could add a plot of the distribution of the human annotations aggregated with majority voting.

The computations look correct to me.

Large diffs are not rendered by default.

Empty file added src/ml_filter/__init__.py
Empty file.
37 changes: 33 additions & 4 deletions src/ml_filter/analysis/interrater_reliability.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.metrics import cohen_kappa_score, f1_score, ndcg_score, precision_score, recall_score
from statsmodels.stats.inter_rater import fleiss_kappa

from ml_filter.analysis.plot_score_distributions import plot_confusion_matrix
Expand Down Expand Up @@ -189,7 +189,33 @@ def compute_gt_metrics(
# Otherwise, zipping will provide the wrong results
class_f1_scores = f1_score(ground_truth_rounded, predictions_rounded, average=None, labels=valid_labels)
for valid_label, f1 in zip(valid_labels, class_f1_scores):
gt_metrics[f"F1-{valid_label}"] = f1
gt_metrics[f"F1-{valid_label}_vs_rest"] = f1

# f1 score at threshold
for t in np.array(list(range(5))) + 0.5:
ground_truth_rounded_bin = (np.array(ground_truth_rounded) >= t).astype(int)
predictions_rounded_bin = (np.array(predictions_rounded) >= t).astype(int)
gt_metrics[f"F1-{t}"] = f1_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Recall-{t}"] = recall_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Precision-{t}"] = precision_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)

# NDCG@all
gt_metrics["NDCG@all"] = ndcg_score(y_true=[ground_truth_scores], y_score=[predicted_scores], k=None)

return gt_metrics

Expand All @@ -215,7 +241,7 @@ def plot_invalid_docs_histogram(
plt.hist(correct_scores_of_invalid_docs, bins=[0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5], alpha=0.5, edgecolor="black")
plt.xlabel("Scores")
plt.ylabel("Frequency")
plt.title(f"Histogram of Invalid Scores for {annotator_name} and langauge {language}.")
plt.title(f"Histogram of invalid scores for {annotator_name} and language {language}.")
plt.grid(True)
plt.savefig(output_file_path)

Expand Down Expand Up @@ -368,12 +394,15 @@ def compare_annotator_to_gt(
gt_idx = 0
ground_truth_scores = valid_docs_df["score_0"].to_list()
predicted_scores = valid_docs_df["score_1"].to_list()
else:
elif annotators[1] == "gt":
annotator_idx = 0
gt_idx = 1
ground_truth_scores = valid_docs_df["score_1"].to_list()
predicted_scores = valid_docs_df["score_0"].to_list()

else:
raise ValueError(f"Expected one of the annotators to be 'gt', but found {annotators[0]} and {annotators[1]}")

annotator_name = annotators[annotator_idx]

gt_metrics = compute_gt_metrics(
Expand Down
22 changes: 22 additions & 0 deletions src/ml_filter/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import pandas as pd

from ml_filter.utils.logging import get_logger


def custom_round(x: int | float) -> int:
"""Rounds values > x.5 to x+1 and values < x.5 to x.
Expand Down Expand Up @@ -83,6 +85,10 @@ def get_document_scores_df(
with open(file_path, "r") as f:
for line in f:
json_obj = json.loads(line)
if "document_id" not in json_obj or json_obj["document_id"] is None:
raise ValueError(
f"Document ID is missing in the JSON object: {json_obj}. Please check the input file."
)

# replace invalid scores with None
scores = []
Expand Down Expand Up @@ -124,6 +130,19 @@ def get_document_scores_df(
)

document_scores_df = pd.DataFrame(document_scores)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the filtering necessary? We handle the case of missing / unmatched document IDs later by filtering on documents that are common to the annotators we are currently comparing.

# make sure that we have the same number of documents with the same doc_id for each annotator
doc_ids_per_annotator = document_scores_df.groupby(by=["annotator", "prompt", "prompt_lang"])["doc_id"].apply(set)
first_doc_ids = next(iter(doc_ids_per_annotator))
for index, doc_ids in zip(doc_ids_per_annotator.index, doc_ids_per_annotator):
if not doc_ids == first_doc_ids:
if len(doc_ids - first_doc_ids) > 0:
get_logger(name="main").warning(
f"{'__'.join(doc_ids_per_annotator.index[0])} misses: {doc_ids - first_doc_ids}"
)
if len(first_doc_ids - doc_ids) > 0:
get_logger(name="main").warning(f"{'__'.join(index)} misses: {first_doc_ids - doc_ids}")

return document_scores_df


Expand Down Expand Up @@ -173,6 +192,9 @@ def get_common_docs(document_scores_df: pd.DataFrame, annotator_0: str, annotato
# only consider documents that are annotated by both annotators and have valid scores
common_docs_df = pd.merge(annotator_0_df, annotator_1_df, on=["doc_id", "prompt"], suffixes=("_0", "_1"))

if len(common_docs_df) * 2 != len(document_scores_df):
get_logger(name="main").warning("Not all documents can be matched on columns doc_id and prompt.")

# add rounded scores for each annotator
for idx in (0, 1):
common_docs_df[f"rounded_score_{idx}"] = common_docs_df[f"score_{idx}"].apply(round_scores)
Expand Down