@@ -77,7 +77,6 @@ class TaskGrouping(t.TypedDict):


 def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
-    os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "true"
     results = simple_evaluate(
         tasks=args["tasks"],
         model="vllm",
@@ -93,12 +92,12 @@ def evaluate_with_vllm(args: LeaderboardArgs) -> t.Dict[str, t.Any]:
         apply_chat_template=True,
         fewshot_as_multiturn=True,
         batch_size="auto",
+        confirm_run_unsafe_code=True,
     )
     return results


 def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
-    os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = "true"
     os.environ["RANK"] = str(rank)
     os.environ["WORLD_SIZE"] = str(world_size)
     os.environ["LOCAL_RANK"] = str(rank)
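For context on the flag being threaded through both backends here: a minimal sketch of calling lm-eval's `simple_evaluate` directly, assuming a recent `lm-eval` release whose `simple_evaluate()` accepts the `confirm_run_unsafe_code` keyword. The model path and task list are illustrative placeholders, not values from this PR.

```python
# Minimal sketch of the vLLM path above; model path and tasks are
# placeholders, and confirm_run_unsafe_code assumes a recent lm-eval.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="vllm",
    model_args="pretrained=/path/to/model,dtype=auto",
    tasks=["leaderboard_ifeval", "leaderboard_math_hard"],
    apply_chat_template=True,
    fewshot_as_multiturn=True,
    batch_size="auto",
    confirm_run_unsafe_code=True,  # opt in to tasks that execute generated code
)
print(results["results"].keys())
```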
@@ -122,17 +121,15 @@ def worker(rank, world_size, args: LeaderboardArgs, result_queue: mp.Queue):
         batch_size="auto",
         device=f"cuda:{device.index}",
         cache_requests=True,
+        confirm_run_unsafe_code=True,
     )

-    print(f"Rank {rank} got results: {type(results)}, putting them in the bucket")
     result_queue.put((rank, results))
-    print(f"Rank {rank} done putting results in the bucket")

     # clear torch memory
     gc.collect()
     torch.cuda.empty_cache()

-    print(f"Rank {rank} destroying process group")
     dist.destroy_process_group()


@@ -159,10 +156,8 @@ def evaluate_with_hf(args: LeaderboardArgs) -> t.Dict[str, t.Any]:

     results = {}
     for _ in range(num_processes):
-        print(f"[master] getting results from the bucket")
         rank, result = result_queue.get()
         results[rank] = result
-        print(f"[master] got results from rank {rank}")

     # Wait for all processes to complete
     for p in processes:
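The worker/gather code in these two hunks follows the standard `multiprocessing` recipe: each rank runs its shard, pushes `(rank, results)` into a shared queue, and the parent drains the queue exactly once per process before joining. A stripped-down sketch of that pattern, where `evaluate_one_rank` is a hypothetical stand-in for the per-rank lm-eval call:

```python
# Stripped-down sketch of the spawn-and-gather pattern used by
# worker()/evaluate_with_hf(); evaluate_one_rank is a hypothetical
# stand-in for the per-rank lm-eval invocation.
import multiprocessing as mp


def evaluate_one_rank(rank: int) -> dict:
    return {"rank": rank, "score": 0.0}  # placeholder payload


def worker(rank: int, result_queue: mp.Queue) -> None:
    result_queue.put((rank, evaluate_one_rank(rank)))


if __name__ == "__main__":
    world_size = 4
    ctx = mp.get_context("spawn")  # CUDA requires spawn, not fork
    result_queue = ctx.Queue()
    processes = [
        ctx.Process(target=worker, args=(rank, result_queue))
        for rank in range(world_size)
    ]
    for p in processes:
        p.start()
    # drain the queue once per rank *before* joining, so a full queue
    # can never deadlock a child that is still flushing its payload
    results = dict(result_queue.get() for _ in range(world_size))
    for p in processes:
        p.join()
    print(sorted(results.keys()))
```

Getting all queue items before `join()` matters: a child process does not exit until its queued data is consumed, so joining first can deadlock. The original code keeps this ordering.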
@@ -209,7 +204,7 @@ def parse_multitask_results(

     The end result is an unweighted average of all the subtasks, as well as a per-subtask breakdown.
     """
-    parsed_scores = {"score": 0.0, "subtasks": {}}
+    parsed_scores: ParsedScores = {"score": 0.0, "subtasks": {}}
     subtask_scores = {}
     target_subtasks = result_dict["group_subtasks"].get(benchmark)
     if not target_subtasks:
@@ -335,7 +330,7 @@ def parse_math_hard(result_dict: t.Dict[str, t.Any]) -> ParsedScores:
     return parsed_scores


-def get_parser(subtask: str) -> t.Callable[[t.Dict, str, str], ParsedScores]:
+def get_parser(subtask: str) -> t.Callable[[t.Dict[str, t.Any]], ParsedScores]:
     parser_map = {
         LeaderboardV2Tasks.BBH.value: parse_bbh,
         LeaderboardV2Tasks.GPQA.value: parse_gpqa,
@@ -349,13 +344,45 @@ def get_parser(subtask: str) -> t.Callable[[t.Dict, str, str], ParsedScores]:
     ]  # this will either parse and map into the correct section, or error


+def build_leaderboard_v2_result(
+    parsed_scores: t.Dict[str, ParsedScores],
+) -> LeaderboardV2EvalResult:
+    """
+    Build the leaderboard v2 result from the parsed scores.
+    """
+    # now let's build the overall score
+    leaderboard_result: LeaderboardV2EvalResult = {
+        "overall_score": calculate_overall_leaderboard_score(parsed_scores),
+    }
+
+    # explicitly set the score for each subtask in order to satisfy mypy
+    if "leaderboard_bbh" in parsed_scores:
+        leaderboard_result["leaderboard_bbh"] = parsed_scores["leaderboard_bbh"]
+    if "leaderboard_gpqa" in parsed_scores:
+        leaderboard_result["leaderboard_gpqa"] = parsed_scores["leaderboard_gpqa"]
+    if "leaderboard_ifeval" in parsed_scores:
+        leaderboard_result["leaderboard_ifeval"] = parsed_scores["leaderboard_ifeval"]
+    if "leaderboard_math_hard" in parsed_scores:
+        leaderboard_result["leaderboard_math_hard"] = parsed_scores[
+            "leaderboard_math_hard"
+        ]
+    if "leaderboard_mmlu_pro" in parsed_scores:
+        leaderboard_result["leaderboard_mmlu_pro"] = parsed_scores[
+            "leaderboard_mmlu_pro"
+        ]
+    if "leaderboard_musr" in parsed_scores:
+        leaderboard_result["leaderboard_musr"] = parsed_scores["leaderboard_musr"]
+
+    return leaderboard_result
+
+
 def get_scores_from_result_dicts(
-    *result_dicts: t.List[t.Dict[str, t.Any]],
-) -> t.Dict[str, ParsedScores]:
+    *result_dicts: t.Dict[str, t.Any],
+) -> LeaderboardV2EvalResult:
     """
     Parse out the scores of all the subtasks of leaderboard and return.
     """
-    parsed_scores = {}
+    parsed_scores: t.Dict[str, ParsedScores] = {}
     for result_dict in result_dicts:
         benchmarks_we_got = set(result_dict["results"].keys())
         benchmarks_we_care_about = set(
@@ -375,7 +402,7 @@ def get_scores_from_result_dicts(
             parse_benchmark_fn = get_parser(benchmark)
             parsed_scores[benchmark] = parse_benchmark_fn(result_dict)

-    return parsed_scores
+    return build_leaderboard_v2_result(parsed_scores)


 def validate_output_path(output_file: str) -> None:
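The chain of `if "..." in parsed_scores` checks in `build_leaderboard_v2_result` looks redundant, but it is what lets the code type-check: TypedDict assignments require string-literal keys, so copying the entries in a loop over dynamic keys would be rejected by mypy. A small self-contained illustration of that constraint, using a hypothetical stand-in for the real `LeaderboardV2EvalResult`:

```python
# Why the per-key assignments above satisfy mypy. EvalResult is a
# hypothetical stand-in for LeaderboardV2EvalResult.
import typing as t


class EvalResult(t.TypedDict, total=False):
    overall_score: float
    leaderboard_bbh: dict


def build(parsed: t.Dict[str, dict]) -> EvalResult:
    result: EvalResult = {"overall_score": 0.0}
    # OK: the key is a string literal, so mypy can verify it
    if "leaderboard_bbh" in parsed:
        result["leaderboard_bbh"] = parsed["leaderboard_bbh"]
    # Not OK: mypy rejects dynamic keys into a TypedDict, e.g.
    #   for key in parsed:
    #       result[key] = parsed[key]  # error: key must be a string literal
    return result
```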
@@ -453,9 +480,9 @@ class LeaderboardV2Evaluator(Evaluator):
     def __init__(
         self,
         model_path: str,
-        tasks: t.List[str] = None,
-        num_gpus: int = None,
-        output_file: str = None,
+        tasks: t.Optional[t.List[str]] = None,
+        num_gpus: t.Optional[int] = None,
+        output_file: t.Optional[str] = None,
     ):
         self.model_path = model_path
         if not cuda.is_available():
@@ -469,11 +496,13 @@ def __init__(

         # validate output file
         self.output_file = output_file
-        self._results = None
-        self._lm_eval_results = []  # TODO: make it merge everything back into a single result
+        self._results: t.Optional[LeaderboardV2EvalResult] = None
+        self._lm_eval_results: t.List[
+            t.Dict[str, t.Any]
+        ] = []  # TODO: make it merge everything back into a single result

     @property
-    def results(self) -> LeaderboardV2EvalResult:
+    def results(self) -> t.Optional[LeaderboardV2EvalResult]:
         """
         Returns the results of the most recent leaderboard evaluation.

@@ -492,7 +521,7 @@ def lm_eval_results(self) -> t.List[t.Dict[str, t.Any]]:
         """
         return self._lm_eval_results

-    def save_to_file(self, output_file: str = None):
+    def save_to_file(self, output_file: t.Optional[str] = None) -> None:
         """
         Saves the results to a file.

@@ -513,10 +542,10 @@ def save_to_file(self, output_file: str = None):

     def run(
         self,
-        model_path: str | None = None,
-        tasks: t.List[str] = None,
-        num_gpus: int = None,
-        output_file: str = None,
+        model_path: t.Optional[str] = None,
+        tasks: t.Optional[t.List[str]] = None,
+        num_gpus: t.Optional[int] = None,
+        output_file: t.Optional[str] = None,
     ) -> LeaderboardV2EvalResult:
         """
         Run the Open LLM Leaderboard v2 evaluation.
@@ -538,6 +567,9 @@ def run(
         num_gpus = self.num_gpus if not num_gpus else num_gpus
         output_file = self.output_file if not output_file else output_file

+        if not tasks:
+            tasks = LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS
+
         # validation logic
         # no need to validate model path -- the inference libraries will either be able to
         # load it, or they won't
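With the new fallback, callers no longer need to pass a task list to run the full suite. A hypothetical usage sketch (the model path and output file are placeholders, not values from this PR):

```python
# Hypothetical usage of the evaluator after this change; paths are
# placeholders. Omitting tasks now runs the full suite
# (LEADERBOARD_V2_MCQ_TASKS + LEADERBOARD_V2_GENERATIVE_TASKS).
evaluator = LeaderboardV2Evaluator(
    model_path="/path/to/model",
    output_file="leaderboard_v2_results.json",
)
results = evaluator.run()
print(f"overall: {results['overall_score']:.4f}")
evaluator.save_to_file()  # writes to the output_file given above
```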
@@ -562,25 +594,26 @@ def run(
         self._lm_eval_results = []
         vllm_results, hf_results = None, None
         if vllm_tasks := grouped_tasks["vllm"]:
-            args: LeaderboardArgs = {
+            args_vllm: LeaderboardArgs = {
                 "model_path": model_path,
                 "num_gpus": num_gpus,
                 "tasks": vllm_tasks,
             }
-            vllm_results = evaluate_with_vllm(args)
+            vllm_results = evaluate_with_vllm(args_vllm)
             self._lm_eval_results.append(vllm_results)
         if hf_tasks := grouped_tasks["huggingface"]:
-            args: LeaderboardArgs = {
+            args_hf: LeaderboardArgs = {
                 "model_path": model_path,
                 "num_gpus": num_gpus,
                 "tasks": hf_tasks,
             }
-            hf_results = evaluate_with_hf(args)
+            hf_results = evaluate_with_hf(args_hf)
             self._lm_eval_results.append(hf_results)

         # convert the output of lm-eval into something that's already parsed
-        results = get_scores_from_result_dicts(*self._lm_eval_results)
-        results["overall_score"] = calculate_overall_leaderboard_score(results)
+        results: LeaderboardV2EvalResult = get_scores_from_result_dicts(
+            *self._lm_eval_results
+        )

         self._results = results
         if output_file:
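The tail of `run()` is now a single typed pipeline: each backend contributes one raw lm-eval dict, and `get_scores_from_result_dicts` both parses them and assembles the final `LeaderboardV2EvalResult` (including `overall_score`), so the old in-place mutation is gone. A condensed, self-contained sketch of the dispatch idiom, with stub evaluators standing in for the real backends:

```python
# Condensed sketch of the dispatch-and-merge flow at the end of run();
# the evaluate_* stubs stand in for the real vLLM/HF backends.
import typing as t


def evaluate_with_vllm(tasks: t.List[str]) -> dict:
    return {"results": {task: {} for task in tasks}}  # stub


def evaluate_with_hf(tasks: t.List[str]) -> dict:
    return {"results": {task: {} for task in tasks}}  # stub


grouped_tasks = {"vllm": ["leaderboard_mmlu_pro"], "huggingface": ["leaderboard_ifeval"]}
lm_eval_results: t.List[dict] = []

# the walrus operator both tests for a non-empty group and binds it,
# so each backend only spins up when it actually has work to do
if vllm_tasks := grouped_tasks["vllm"]:
    lm_eval_results.append(evaluate_with_vllm(vllm_tasks))
if hf_tasks := grouped_tasks["huggingface"]:
    lm_eval_results.append(evaluate_with_hf(hf_tasks))

print(len(lm_eval_results))  # -> 2
```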