11import json
22import re
33from dataclasses import dataclass
4- from datetime import datetime , timezone
4+ from datetime import UTC , datetime
55from pathlib import Path
66
77import numpy as np
88import pandas as pd
9- from langchain_core .prompts import ChatPromptTemplate
109from langchain_core .language_models .llms import LLM
10+ from langchain_core .prompts import ChatPromptTemplate
1111
1212from judgearena .instruction_dataset import load_instructions
13- from judgearena .repro import write_run_metadata , _to_jsonable
13+ from judgearena .repro import _to_jsonable , write_run_metadata
1414from judgearena .utils import (
1515 compute_pref_summary ,
16- read_df ,
1716 data_root ,
18- download_hf ,
1917 do_inference ,
18+ download_hf ,
19+ read_df ,
2020)
2121
2222
@@ -55,13 +55,13 @@ def load_judge_system_and_user_prompt(
5555 provide_explanation : bool = True ,
5656) -> tuple [str , str ]:
5757 # Prepare judge
58- with open (Path (__file__ ).parent / "prompts" / "system-prompt.txt" , "r" ) as f :
58+ with open (Path (__file__ ).parent / "prompts" / "system-prompt.txt" ) as f :
5959 system_prompt = str (f .read ())
6060
6161 prompt_filename = (
6262 "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
6363 )
64- with open (Path (__file__ ).parent / "prompts" / prompt_filename , "r" ) as f :
64+ with open (Path (__file__ ).parent / "prompts" / prompt_filename ) as f :
6565 user_prompt_template = str (f .read ())
6666
6767 return system_prompt , user_prompt_template
@@ -109,7 +109,7 @@ def evaluate_completions(
109109 exceeding context limit
110110 :return:
111111 """
112- run_started_at = datetime .now (timezone . utc )
112+ run_started_at = datetime .now (UTC )
113113 local_path_tables = data_root / "tables"
114114 download_hf (name = dataset , local_path = local_path_tables )
115115
@@ -140,9 +140,9 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
140140 return df .loc [:, "output" ]
141141 else :
142142 print (f"Loading { method } from { dataset } dataset." )
143- assert (
144- method in df_outputs .columns
145- ), f"Method { method } not present, pick among { df_outputs . columns . tolist () } "
143+ assert method in df_outputs . columns , (
144+ f"Method { method } not present, pick among { df_outputs .columns . tolist () } "
145+ )
146146 return df_outputs .loc [:, method ].sort_index ()
147147
148148 completions_A = get_output (df_outputs = df_outputs , dataset = dataset , method = method_A )
@@ -151,9 +151,9 @@ def get_output(df_outputs: pd.DataFrame, dataset: str, method: str):
151151 instructions = instructions .head (num_annotations )
152152 completions_A = completions_A .head (num_annotations )
153153 completions_B = completions_B .head (num_annotations )
154- assert (
155- completions_A . index . tolist () == completions_B . index . tolist ()
156- ), f"Index mismatch between methods { method_A } and { method_B } ."
154+ assert completions_A . index . tolist () == completions_B . index . tolist (), (
155+ f"Index mismatch between methods { method_A } and { method_B } ."
156+ )
157157
158158 if judge_chat_model is None :
159159 from langchain_together .llms import Together
@@ -303,7 +303,7 @@ def truncate(s: str, max_len: int | None = None):
303303 "completion_B" : truncate (completion_B , max_len = truncate_input_chars ),
304304 }
305305 for user_prompt , completion_A , completion_B in zip (
306- instructions , completions_A , completions_B
306+ instructions , completions_A , completions_B , strict = True
307307 )
308308 ]
309309 )
@@ -316,7 +316,7 @@ def truncate(s: str, max_len: int | None = None):
316316
317317 annotations = []
318318 for judge_completion , instruction , completion_A , completion_B in zip (
319- judge_completions , instructions , completions_A , completions_B
319+ judge_completions , instructions , completions_A , completions_B , strict = True
320320 ):
321321 annotations .append (
322322 JudgeAnnotation (
@@ -381,7 +381,8 @@ def judge_and_parse_prefs(
381381 use_tqdm = use_tqdm ,
382382 )
383383
384- _none_to_nan = lambda x : float ("nan" ) if x is None else x
384+ def _none_to_nan (x ):
385+ return float ("nan" ) if x is None else x
385386
386387 score_parser = PairScore ()
387388 prefs = pd .Series (
0 commit comments