Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25,860 changes: 25,817 additions & 43 deletions notebooks/edu_content_human_as_a_judge.ipynb
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few comments on the notebook:

  • You define jsonl_path = "annotations__educational_content__en__gt.jsonl". This fails for me because the file is missing. Should this be gt_annotations_path instead?
  • Why do we plot the standard deviations as a histogram, but the spread as a cumulative distribution?
  • In the section about spreads, why do we print Document ID 1 and 2? Shouldn't they be the same? Or is this just a sanity check?
  • For the evaluation of our predictions, we aggregate the human annotations using majority voting. In the notebook, we're only looking at the mean of the human annotations. Should we add info about the majority voting as well? E.g. we could add a plot of the distribution of the human annotations aggregated with majority voting.

The computations look correct to me.

Large diffs are not rendered by default.

Empty file added src/ml_filter/__init__.py
Empty file.
37 changes: 33 additions & 4 deletions src/ml_filter/analysis/interrater_reliability.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from sklearn.metrics import cohen_kappa_score, f1_score
from sklearn.metrics import cohen_kappa_score, f1_score, ndcg_score, precision_score, recall_score
from statsmodels.stats.inter_rater import fleiss_kappa

from ml_filter.analysis.plot_score_distributions import plot_confusion_matrix
Expand Down Expand Up @@ -189,7 +189,33 @@ def compute_gt_metrics(
# Otherwise, zipping will provide the wrong results
class_f1_scores = f1_score(ground_truth_rounded, predictions_rounded, average=None, labels=valid_labels)
for valid_label, f1 in zip(valid_labels, class_f1_scores):
gt_metrics[f"F1-{valid_label}"] = f1
gt_metrics[f"F1-{valid_label}_vs_rest"] = f1

# f1 score at threshold
for t in np.array(list(range(5))) + 0.5:
ground_truth_rounded_bin = (np.array(ground_truth_rounded) >= t).astype(int)
predictions_rounded_bin = (np.array(predictions_rounded) >= t).astype(int)
gt_metrics[f"F1-{t}"] = f1_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Recall-{t}"] = recall_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)
gt_metrics[f"Precision-{t}"] = precision_score(
ground_truth_rounded_bin,
predictions_rounded_bin,
labels=[int(valid_label) for valid_label in valid_labels],
zero_division=0,
)

# NDCG@all
gt_metrics["NDCG@all"] = ndcg_score(y_true=[ground_truth_scores], y_score=[predicted_scores], k=None)

return gt_metrics

Expand All @@ -215,7 +241,7 @@ def plot_invalid_docs_histogram(
plt.hist(correct_scores_of_invalid_docs, bins=[0, 0.5, 1.5, 2.5, 3.5, 4.5, 5.5], alpha=0.5, edgecolor="black")
plt.xlabel("Scores")
plt.ylabel("Frequency")
plt.title(f"Histogram of Invalid Scores for {annotator_name} and langauge {language}.")
plt.title(f"Histogram of invalid scores for {annotator_name} and language {language}.")
plt.grid(True)
plt.savefig(output_file_path)

Expand Down Expand Up @@ -368,12 +394,15 @@ def compare_annotator_to_gt(
gt_idx = 0
ground_truth_scores = valid_docs_df["score_0"].to_list()
predicted_scores = valid_docs_df["score_1"].to_list()
else:
elif annotators[1] == "gt":
annotator_idx = 0
gt_idx = 1
ground_truth_scores = valid_docs_df["score_1"].to_list()
predicted_scores = valid_docs_df["score_0"].to_list()

else:
raise ValueError(f"Expected one of the annotators to be 'gt', but found {annotators[0]} and {annotators[1]}")

annotator_name = annotators[annotator_idx]

gt_metrics = compute_gt_metrics(
Expand Down
22 changes: 22 additions & 0 deletions src/ml_filter/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import pandas as pd

from ml_filter.utils.logging import get_logger


def custom_round(x: int | float) -> int:
"""Rounds values > x.5 to x+1 and values < x.5 to x.
Expand Down Expand Up @@ -83,6 +85,10 @@ def get_document_scores_df(
with open(file_path, "r") as f:
for line in f:
json_obj = json.loads(line)
if "document_id" not in json_obj or json_obj["document_id"] is None:
raise ValueError(
f"Document ID is missing in the JSON object: {json_obj}. Please check the input file."
)

# replace invalid scores with None
scores = []
Expand Down Expand Up @@ -124,6 +130,19 @@ def get_document_scores_df(
)

document_scores_df = pd.DataFrame(document_scores)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the filtering necessary? We handle the case of missing / unmatched document IDs later by filtering on documents that are common to the annotators we are currently comparing.

# make sure that we have the same number of documents with the same doc_id for each annotator
doc_ids_per_annotator = document_scores_df.groupby(by=["annotator", "prompt", "prompt_lang"])["doc_id"].apply(set)
first_doc_ids = next(iter(doc_ids_per_annotator))
for index, doc_ids in zip(doc_ids_per_annotator.index, doc_ids_per_annotator):
if not doc_ids == first_doc_ids:
if len(doc_ids - first_doc_ids) > 0:
get_logger(name="main").warning(
f"{'__'.join(doc_ids_per_annotator.index[0])} misses: {doc_ids - first_doc_ids}"
)
if len(first_doc_ids - doc_ids) > 0:
get_logger(name="main").warning(f"{'__'.join(index)} misses: {first_doc_ids - doc_ids}")

return document_scores_df


Expand Down Expand Up @@ -173,6 +192,9 @@ def get_common_docs(document_scores_df: pd.DataFrame, annotator_0: str, annotato
# only consider documents that are annotated by both annotators and have valid scores
common_docs_df = pd.merge(annotator_0_df, annotator_1_df, on=["doc_id", "prompt"], suffixes=("_0", "_1"))

if len(common_docs_df) * 2 != len(document_scores_df):
get_logger(name="main").warning("Not all documents can be matched on columns doc_id and prompt.")

# add rounded scores for each annotator
for idx in (0, 1):
common_docs_df[f"rounded_score_{idx}"] = common_docs_df[f"score_{idx}"].apply(round_scores)
Expand Down